Bring URL transforming up to spec

URL transformation (formerly called "munging") is now, as far as I can
tell, fully compliant with RFC 3986 (section 5.2, reference resolution).
It is implemented in pure bash and is also available as a library at
https://git.sr.ht/~acdw/shurlie.
This commit is contained in:
Case Duckworth 2020-05-30 13:38:18 -05:00
parent 482e2659ae
commit c62e428c16
1 changed files with 120 additions and 82 deletions

202
bollux
View File

@ -122,7 +122,7 @@ blastoff() { # load a url
URL="$1"
if $well_formed && [[ "$1" != "$BOLLUX_URL" ]]; then
URL="$(run munge_url "$1" "$BOLLUX_URL")"
URL="$(run transform_resource "$BOLLUX_URL" "$1")"
fi
[[ "$URL" != *://* ]] && URL="$BOLLUX_PROTO://$URL"
URL="$(trim <<<"$URL")"
@ -134,95 +134,133 @@ blastoff() { # load a url
run handle_response "$URL"
}
munge_url() {
local -A new old u
eval "$(split_url new <<<"$1")"
for k in "${!new[@]}"; do log d "new[$k]=${new[$k]}"; done
eval "$(split_url old <<<"$2")"
for k in "${!old[@]}"; do log d "old[$k]=${old[$k]}"; done
u['scheme']="${new['scheme']:-${old['scheme']:-}}"
u['authority']="${new['authority']:-${old['authority']:-}}"
# XXX this whole path thing is wack
if [[ "${new['path']+isset}" ]]; then
log d 'new path set'
if [[ "${new['path']}" == /* ]]; then
log d 'new path == /*'
u['path']="${new['path']}"
elif [[ "${new['authority']}" == "${old['authority']}" || ! "${new['authority']+isset}" ]]; then
p="${old['path']:-}/${new['path']}"
log d "$p ( $(normalize_path <<<"$p") )"
u['path']="$(normalize_path <<<"$p")"
else
log d 'u path = new path'
u['path']="${new['path']}"
fi
elif [[ "${new['query']+isset}" || "${new['fragment']+isset}" ]]; then
log d 'u path = old path'
u['path']="${old['path']}"
else
u['path']="/"
transform_resource() { # transform_resource BASE_URL REFERENCE_URL
  # Resolve REFERENCE_URL against BASE_URL following the algorithm of
  # RFC 3986 section 5.2.2 and print the recomposed target URL (5.3).
  #
  # Arguments: $1 - base URL, $2 - reference URL (may be relative)
  # Globals:   STRICT (read) - run as a command; when it fails (e.g.
  #            STRICT=false) a reference scheme equal to the base scheme
  #            is ignored. Defaults to `true`.
  # Outputs:   the target URL on stdout, followed by a newline.
  #
  # A component counts as "defined" when its key is set in the array,
  # even if the value is empty; an undefined base component is therefore
  # never copied into T, so recomposition does not emit a stray ":",
  # "//" or "?".
  local -A R B T # reference, base url, target
  local r=""
  eval "$(parse_url B "$1")"
  eval "$(parse_url R "$2")"

  # A non-strict parser may ignore a scheme in the reference
  # if it is identical to the base URI's scheme.
  if ! "${STRICT:-true}" && [[ "${R[scheme]-}" == "${B[scheme]-}" ]]; then
    # Unset the array element itself (quoted so it is not a glob).
    unset 'R[scheme]'
  fi

  # basically pseudo-code from spec ported to bash
  if isdefined "R[scheme]"; then
    T[scheme]="${R[scheme]}"
    isdefined "R[authority]" && T[authority]="${R[authority]}"
    isdefined "R[path]" &&
      T[path]="$(remove_dot_segments "${R[path]}")"
    isdefined "R[query]" && T[query]="${R[query]}"
  else
    if isdefined "R[authority]"; then
      T[authority]="${R[authority]}"
      isdefined "R[path]" &&
        T[path]="$(remove_dot_segments "${R[path]}")"
      isdefined "R[query]" && T[query]="${R[query]}"
    else
      if isempty "R[path]"; then
        T[path]="${B[path]-}"
        if isdefined "R[query]"; then
          T[query]="${R[query]}"
        elif isdefined "B[query]"; then
          T[query]="${B[query]}"
        fi
      else
        if [[ "${R[path]}" == /* ]]; then
          T[path]="$(remove_dot_segments "${R[path]}")"
        else
          # First argument: "//authority" when the base has an
          # authority component, empty otherwise (rather than the
          # literal text "B[authority]").
          T[path]="$(merge_paths "${B[authority]+//${B[authority]}}" \
            "${B[path]-}" "${R[path]}")"
          T[path]="$(remove_dot_segments "${T[path]}")"
        fi
        isdefined "R[query]" && T[query]="${R[query]}"
      fi
      isdefined "B[authority]" && T[authority]="${B[authority]}"
    fi
    isdefined "B[scheme]" && T[scheme]="${B[scheme]}"
  fi
  isdefined "R[fragment]" && T[fragment]="${R[fragment]}"

  # cf. 5.3 -- recomposition
  isdefined "T[scheme]" && r="$r${T[scheme]}:"
  isdefined "T[authority]" && r="$r//${T[authority]}"
  r="$r${T[path]-}"
  isdefined "T[query]" && r="$r?${T[query]}"
  isdefined "T[fragment]" && r="$r#${T[fragment]}"
  printf '%s\n' "$r"
}
normalize_path() {
gawk '{
split($0, path, /\//)
for (c in path) {
if (path[c] == "" || path[c] == ".") {
continue
}
if (path[c] == "..") {
sub(/[^\/]+$/, "", ret)
continue
}
if (! ret || match(ret, /\/$/)) {
slash = ""
} else {
slash = "/"
}
ret = ret slash path[c]
}
print (ret ~ /^\// ? "" : "/") ret
}'
merge_paths() { # 5.2.3
  # Merge a relative reference path with a base path (RFC 3986, 5.2.3).
  #
  # Arguments:
  #   $1 - base authority indicator: any non-empty string means the base
  #        URI has an authority component; empty means it has none
  #   $2 - base path
  #   $3 - reference path (expected not to start with "/")
  # Outputs: the merged path on stdout, followed by a newline.
  #
  # The previous `${…//\/\//\//}` substitutions used the replacement text
  # `\//`, so they never collapsed duplicate slashes; paths are printed
  # without that rewrite here.
  local B_authority="$1"
  local B_path="$2"
  local R_path="$3"

  # Nothing to merge: hand back the base path untouched.
  if [[ -z "$R_path" ]]; then
    printf '%s\n' "$B_path"
    return
  fi

  if [[ -n "$B_authority" && -z "$B_path" ]]; then
    # Base has an authority and an empty path: "/" + reference path.
    printf '/%s\n' "$R_path"
  elif [[ "$B_path" == */* ]]; then
    # Drop everything after the right-most "/" of the base path.
    printf '%s/%s\n' "${B_path%/*}" "${R_path#/}"
  else
    # Base path has no "/": it is excluded entirely.
    printf '%s\n' "$R_path"
  fi
}
split_url() {
gawk -vvar="$1" '{
if (match($0, /^[A-Za-z]+:/)) {
arr["scheme"] = substr($0, RSTART, RLENGTH)
$0 = substr($0, RLENGTH + 1)
}
if (match($0, /^\/\/[^\/?#]+?/) || (match($0, /^[^\/?#]+?/) && scheme)) {
arr["authority"] = substr($0, RSTART, RLENGTH)
$0 = substr($0, RLENGTH + 1)
}
if (match($0, /^\/?[^?#]+/)) {
arr["path"] = substr($0, RSTART, RLENGTH)
$0 = substr($0, RLENGTH + 1)
}
if (match($0, /^\?[^#]+/)) {
arr["query"] = substr($0, RSTART, RLENGTH)
$0 = substr($0, RLENGTH + 1)
}
if (match($0, /^#.*/)) {
arr["fragment"] = substr($0, RSTART, RLENGTH)
$0 = substr($0, RLENGTH + 1)
}
for (part in arr) {
sub(/[[:space:]]+$/, "", arr[part])
printf var "[\"%s\"]=\"%s\"\n", part, arr[part]
}
}'
remove_dot_segments() { # 5.2.4
  # Remove "." and ".." segments from a URL path, following the
  # input/output buffer algorithm of RFC 3986 section 5.2.4.
  #
  # Arguments: $1 - path
  # Outputs:   the cleaned path on stdout, followed by a newline.
  #
  # Steps A-E below are the steps of the RFC. Only fixed literal
  # patterns are used for stripping, so segments containing glob
  # characters (e.g. "[x]") are handled like any other text.
  local input="$1"
  local output=""
  local segment
  while [[ -n "$input" ]]; do
    case "$input" in
      # A: leading "../" or "./" is dropped
      ../*) input="${input#../}" ;;
      ./*) input="${input#./}" ;;
      # B: leading "/./" or a lone "/." becomes "/"
      /./*) input="/${input#/./}" ;;
      /.) input="/" ;;
      # C: leading "/../" or a lone "/.." becomes "/" and the last
      #    output segment (with its preceding "/", if any) is removed
      /../* | /..)
        if [[ "$input" == /.. ]]; then
          input="/"
        else
          input="/${input#/../}"
        fi
        if [[ "$output" == */* ]]; then
          output="${output%/*}"
        else
          output=""
        fi
        ;;
      # D: input is only "." or ".."
      . | ..) input="" ;;
      # E: move the first segment (including an initial "/", up to but
      #    not including the next "/") to the end of the output
      *)
        if [[ "$input" == /* ]]; then
          segment="${input#/}"
          segment="/${segment%%/*}"
        else
          segment="${input%%/*}"
        fi
        output+="$segment"
        input="${input:${#segment}}"
        ;;
    esac
  done
  # The previous final substitution used the replacement text `\//`, so
  # it never collapsed duplicate slashes; RFC 3986 keeps empty segments,
  # so the buffer is printed as-is.
  printf '%s\n' "$output"
}
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
  # Split STRING into URL components with the regular expression from
  # RFC 3986 appendix B and print shell assignments `NAME[key]=value`
  # (values quoted with %q) suitable for `eval` into an associative
  # array called NAME.
  #
  # Arguments: $1 - name of the target array, $2 - URL string
  # Outputs:   one assignment per line for each *defined* component among
  #            scheme, authority, query, fragment, then always `path`.
  # Returns:   non-zero if the string does not match the expression.
  #
  # A component is defined when its delimiter is present, even if the
  # value is empty: "file:///x" has an empty authority and "x?" has an
  # empty query. The delimiter-bearing groups (1, 3, 6, 8) are tested
  # for that reason, not the value groups.
  local name="$1"
  local string="$2"
  local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  [[ $string =~ $re ]] || return

  if [[ -n "${BASH_REMATCH[1]}" ]]; then
    printf '%s[scheme]=%q\n' "$name" "${BASH_REMATCH[2]}"
  fi
  if [[ -n "${BASH_REMATCH[3]}" ]]; then
    printf '%s[authority]=%q\n' "$name" "${BASH_REMATCH[4]}"
  fi
  if [[ -n "${BASH_REMATCH[6]}" ]]; then
    printf '%s[query]=%q\n' "$name" "${BASH_REMATCH[7]}"
  fi
  if [[ -n "${BASH_REMATCH[8]}" ]]; then
    printf '%s[fragment]=%q\n' "$name" "${BASH_REMATCH[9]}"
  fi
  # The path group always participates in the match, so path is always
  # emitted, possibly empty.
  printf '%s[path]=%q\n' "$name" "${BASH_REMATCH[5]}"
}
# Succeeds when the variable (or array element) called NAME is set,
# whether or not its value is empty.
isdefined() { # isdefined NAME
  [[ -n "${!1+x}" ]]
}
# Succeeds only when the variable (or array element) called NAME is set
# and its value is the empty string; an unset NAME fails.
isempty() { # isempty NAME
  [[ -z "${!1-x}" ]]
}
request_url() {
local server="$1"
local port="$2"