bollux/transform_uri.sh

#!/usr/bin/env bash
# transform-url
# cf. https://tools.ietf.org/html/rfc3986#section-5 and
# cf. https://tools.ietf.org/html/rfc3986#section-5.1
# cf. also https://tools.ietf.org/html/rfc3986#appendix-B -- regex

# TEST WITH https://tools.ietf.org/html/rfc3986#section-5.4

transform_resource() { # 5.2.2 -- transform_resource BASE REFERENCE
	# Resolve the URI reference $2 against the base URI $1 following the
	# pseudocode of RFC 3986 section 5.2.2, then recompose the target
	# (section 5.3) and print it on stdout.
	declare -A R B T           # reference, base url, target
	# NOTE(review): the text printed by parse_url is executed by eval, so a
	# URL from an untrusted source is only safe here if parse_url
	# shell-quotes every value it emits -- confirm before exposing this.
	eval "$(parse_url R "$2")" # XXX CHANGE
	eval "$(parse_url B "$1")"
	# parse_url only emits components that are non-empty, so a field that
	# is "set" is present and non-empty.  '+x' tests whether a field is
	# set; a trailing '-' supplies an empty default for an unset field.
	if [[ "${R[scheme]+x}" ]]; then
		T[scheme]="${R[scheme]}"
		T[authority]="${R[authority]-}"
		T[path]="$(remove_dot_segments "${R[path]-}")"
		T[query]="${R[query]-}"
	elif [[ "${R[authority]+x}" ]]; then
		T[scheme]="${B[scheme]-}"
		T[authority]="${R[authority]}"
		T[path]="$(remove_dot_segments "${R[path]-}")"
		T[query]="${R[query]-}"
	else
		T[scheme]="${B[scheme]-}"
		T[authority]="${B[authority]-}"
		if [[ -z "${R[path]-}" ]]; then
			# Empty reference path ("", "?y", "#s"): keep the base path
			# and inherit the base query unless the reference has one.
			T[path]="${B[path]-}"
			if [[ "${R[query]+x}" ]]; then
				T[query]="${R[query]}"
			else
				T[query]="${B[query]-}"
			fi
		else
			if [[ "${R[path]}" == /* ]]; then
				T[path]="$(remove_dot_segments "${R[path]}")"
			else
				# "?" tells merge that the base has no authority.
				T[path]="$(merge "${B[authority]-?}" \
					"${B[path]-}" "${R[path]}")"
				T[path]="$(remove_dot_segments "${T[path]}")"
			fi
			T[query]="${R[query]-}"
		fi
	fi
	T[fragment]="${R[fragment]-}"
	# 5.3 -- recomposition
	# Every T field is assigned above, so "present" is tested as
	# "non-empty"; a defined-but-empty component (e.g. a bare trailing
	# "?") is therefore not reproduced.
	local r=""
	if [[ -n "${T[scheme]}" ]]; then
		r+="${T[scheme]}:"
	fi
	if [[ -n "${T[authority]}" ]]; then
		r+="//${T[authority]}"
	fi
	r+="${T[path]}"
	if [[ -n "${T[query]}" ]]; then
		r+="?${T[query]}"
	fi
	if [[ -n "${T[fragment]}" ]]; then
		r+="#${T[fragment]}"
	fi
	printf '%s\n' "$r"
}

merge() { # 5.2.3 -- merge B_AUTHORITY B_PATH R_PATH
	#>If the base URI has a defined authority component and an empty
	#>path, then return a string consisting of "/" concatenated with the
	#>reference's path; otherwise,
	#>return a string consisting of the reference's path component
	#>appended to all but the last segment of the base URI's path (i.e.,
	#>excluding any characters after the right-most "/" in the base URI
	#>path, or excluding the entire base URI path if it does not contain
	#>any "/" characters).
	#
	# Prints the merged path on stdout.  Paths are emitted verbatim (%s):
	# shell-quoting them would corrupt characters such as ";" in the URI.
	local base_authority="$1" # "?" or empty means undefined (see caller)
	local base_path="$2"
	local ref_path="$3"

	# Not part of 5.2.3: with an empty reference path the base path is
	# returned as-is, with doubled slashes collapsed.
	if [[ -z "$ref_path" ]]; then
		printf '%s\n' "${base_path//\/\///}"
		return
	fi

	if [[ "${base_authority:-?}" != "?" && -z "$base_path" ]]; then
		printf '/%s\n' "$ref_path"
	elif [[ "$base_path" == */* ]]; then
		# Keep everything up to and including the right-most "/".
		printf '%s%s\n' "${base_path%/*}/" "$ref_path"
	else
		# No "/" in the base path: the whole base path is dropped.
		printf '%s\n' "$ref_path"
	fi
}

# I can probably just use normalize_path already in bollux here
remove_dot_segments() { # 5.2.4 -- remove_dot_segments PATH
	# Interpret and remove the "." and ".." segments of PATH as described
	# in RFC 3986 section 5.2.4 and print the result on stdout.
	local input="$1"
	local output=""
	# First path segment (with its leading "/", if any), then the rest.
	local seg_re='^(/?[^/]*)(.*)$'
	while [[ -n "$input" ]]; do
		if [[ "$input" == ../* ]]; then
			# A: drop a leading "../"
			input="${input#../}"
		elif [[ "$input" == ./* ]]; then
			# A: drop a leading "./"
			input="${input#./}"
		elif [[ "$input" == /./* ]]; then
			# B: "/./rest" -> "/rest"
			input="/${input#/./}"
		elif [[ "$input" == /. ]]; then
			# B: a final "/." -> "/"
			input="/"
		elif [[ "$input" == /../* || "$input" == /.. ]]; then
			# C: "/../rest" -> "/rest" and a final "/.." -> "/"; either
			# way the last output segment and its "/" are removed.
			if [[ "$input" == /.. ]]; then
				input="/"
			else
				input="/${input#/../}"
			fi
			if [[ "$output" == */* ]]; then
				output="${output%/*}"
			else
				output=""
			fi
		elif [[ "$input" == . || "$input" == .. ]]; then
			# D: a lone "." or ".." vanishes
			input=""
		else
			# E: move the first path segment in the input buffer to the
			# end of the output buffer, including the initial "/"
			# character (if any) and any subsequent characters up to, but
			# not including, the next "/" character or the end of the
			# input buffer.
			if ! [[ "$input" =~ $seg_re ]]; then
				echo NOMATCH >&2
				return 1
			fi
			output+="${BASH_REMATCH[1]}"
			input="${BASH_REMATCH[2]}"
		fi
	done
	# Not part of 5.2.4: doubled slashes are collapsed before printing, so
	# a path built by joining two pieces comes out with one separator.
	printf '%s\n' "${output//\/\///}"
}

# *FINDING* URLS ... IN PURE BASH !!!
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
	# Split STRING into URI components with the RFC 3986 appendix B regex
	# and print one "NAME[component]=value" assignment per non-empty
	# component (scheme, authority, path, query, fragment) on stdout.
	# NAME is expected to be an associative array in the caller.
	# Returns non-zero only if the regex does not match.
	local name="$1"
	local string="$2"
	local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
	[[ $string =~ $re ]] || return $?

	local scheme="${BASH_REMATCH[2]}"
	local authority="${BASH_REMATCH[4]}"
	local path="${BASH_REMATCH[5]}"
	local query="${BASH_REMATCH[7]}"
	local fragment="${BASH_REMATCH[9]}"

	local c
	for c in scheme authority path query fragment; do
		if [[ -n "${!c}" ]]; then
			# %q shell-quotes the whole value, so nothing in the URL
			# (backticks, "$", spaces, quotes, ...) is interpreted when
			# the caller evals this line.
			printf '%s[%s]=%q\n' "$name" "$c" "${!c}"
		fi
	done
	return 0
}

# ease-of-life functions
isdefined() { # isdefined NAME => succeeds iff the variable NAME is set
	# "${!1+x}" expands to "x" for any set variable (an empty one
	# included) and to nothing for an unset one.
	if [[ -z "${!1+x}" ]]; then
		return 1
	fi
	return 0
}
isempty() { # isempty NAME => succeeds iff the variable NAME is set AND empty
	# "${!1-x}" expands to "x" for an unset variable and to the value
	# otherwise, so it is empty only for a variable that is set to "".
	if [[ -n "${!1-x}" ]]; then
		return 1
	fi
	return 0
}

# Script entry point: turn on command tracing, then resolve the
# reference given as $2 against the base URL given as $1.
set -o xtrace
transform_resource "$@"

# NEXT ....
# NORMALIZATION !!!