Bring URL transforming up to spec
URL transformation (formerly called "munging") is now, as far as I can tell, fully compliant with the RFC. It is also implemented in pure bash and is available as a library at https://git.sr.ht/~acdw/shurlie.
This commit is contained in:
parent
482e2659ae
commit
c62e428c16
202
bollux
202
bollux
|
@ -122,7 +122,7 @@ blastoff() { # load a url
|
|||
URL="$1"
|
||||
|
||||
if $well_formed && [[ "$1" != "$BOLLUX_URL" ]]; then
|
||||
URL="$(run munge_url "$1" "$BOLLUX_URL")"
|
||||
URL="$(run transform_resource "$BOLLUX_URL" "$1")"
|
||||
fi
|
||||
[[ "$URL" != *://* ]] && URL="$BOLLUX_PROTO://$URL"
|
||||
URL="$(trim <<<"$URL")"
|
||||
|
@ -134,95 +134,133 @@ blastoff() { # load a url
|
|||
run handle_response "$URL"
|
||||
}
|
||||
|
||||
munge_url() {
|
||||
local -A new old u
|
||||
eval "$(split_url new <<<"$1")"
|
||||
for k in "${!new[@]}"; do log d "new[$k]=${new[$k]}"; done
|
||||
eval "$(split_url old <<<"$2")"
|
||||
for k in "${!old[@]}"; do log d "old[$k]=${old[$k]}"; done
|
||||
|
||||
u['scheme']="${new['scheme']:-${old['scheme']:-}}"
|
||||
u['authority']="${new['authority']:-${old['authority']:-}}"
|
||||
# XXX this whole path thing is wack
|
||||
if [[ "${new['path']+isset}" ]]; then
|
||||
log d 'new path set'
|
||||
if [[ "${new['path']}" == /* ]]; then
|
||||
log d 'new path == /*'
|
||||
u['path']="${new['path']}"
|
||||
elif [[ "${new['authority']}" == "${old['authority']}" || ! "${new['authority']+isset}" ]]; then
|
||||
p="${old['path']:-}/${new['path']}"
|
||||
log d "$p ( $(normalize_path <<<"$p") )"
|
||||
u['path']="$(normalize_path <<<"$p")"
|
||||
else
|
||||
log d 'u path = new path'
|
||||
u['path']="${new['path']}"
|
||||
fi
|
||||
elif [[ "${new['query']+isset}" || "${new['fragment']+isset}" ]]; then
|
||||
log d 'u path = old path'
|
||||
u['path']="${old['path']}"
|
||||
else
|
||||
u['path']="/"
|
||||
transform_resource() { # transform_resource BASE_URL REFERENCE_URL
	# Resolve REFERENCE_URL against BASE_URL and print the resulting target
	# URL on stdout, followed by a newline.
	#
	# $1 - base URL
	# $2 - reference URL (may be relative)
	#
	# Reads STRICT (default "true"); when it is "false", a reference scheme
	# identical to the base scheme is ignored.
	#
	# A component counts as "defined" when its key exists in the array, even
	# if the value is empty; base components are therefore only inherited
	# when they are defined, so that recomposition does not invent a stray
	# ":", "//" or "?" delimiter.
	local -A R B T # reference, base url, target
	eval "$(parse_url B "$1")"
	eval "$(parse_url R "$2")"

	# A non-strict parser may ignore a scheme in the reference
	# if it is identical to the base URI's scheme.
	if ! "${STRICT:-true}" && [[ "${R[scheme]-}" == "${B[scheme]-}" ]]; then
		# unset the array element itself, not a variable named after its value
		unset "R[scheme]"
	fi

	# basically pseudo-code from spec ported to bash
	if isdefined "R[scheme]"; then
		T[scheme]="${R[scheme]}"
		isdefined "R[authority]" && T[authority]="${R[authority]}"
		isdefined "R[path]" &&
			T[path]="$(remove_dot_segments "${R[path]}")"
		isdefined "R[query]" && T[query]="${R[query]}"
	else
		if isdefined "R[authority]"; then
			T[authority]="${R[authority]}"
			isdefined "R[path]" &&
				T[path]="$(remove_dot_segments "${R[path]}")"
			isdefined "R[query]" && T[query]="${R[query]}"
		else
			if isempty "R[path]"; then
				T[path]="${B[path]}"
				if isdefined "R[query]"; then
					T[query]="${R[query]}"
				elif isdefined "B[query]"; then
					T[query]="${B[query]}"
				fi
			else
				if [[ "${R[path]}" == /* ]]; then
					T[path]="$(remove_dot_segments "${R[path]}")"
				else
					# merge_paths receives the NAME of the base authority
					T[path]="$(merge_paths "B[authority]" "${B[path]}" "${R[path]}")"
					T[path]="$(remove_dot_segments "${T[path]}")"
				fi
				isdefined "R[query]" && T[query]="${R[query]}"
			fi
			isdefined "B[authority]" && T[authority]="${B[authority]}"
		fi
		isdefined "B[scheme]" && T[scheme]="${B[scheme]}"
	fi
	isdefined "R[fragment]" && T[fragment]="${R[fragment]}"

	# cf. 5.3 -- recomposition
	local r=""
	isdefined "T[scheme]" && r="$r${T[scheme]}:"
	isdefined "T[authority]" && r="$r//${T[authority]}"
	r="$r${T[path]-}"
	isdefined "T[query]" && r="$r?${T[query]}"
	isdefined "T[fragment]" && r="$r#${T[fragment]}"
	printf '%s\n' "$r"
}
|
||||
|
||||
normalize_path() {
|
||||
gawk '{
|
||||
split($0, path, /\//)
|
||||
for (c in path) {
|
||||
if (path[c] == "" || path[c] == ".") {
|
||||
continue
|
||||
}
|
||||
if (path[c] == "..") {
|
||||
sub(/[^\/]+$/, "", ret)
|
||||
continue
|
||||
}
|
||||
if (! ret || match(ret, /\/$/)) {
|
||||
slash = ""
|
||||
} else {
|
||||
slash = "/"
|
||||
}
|
||||
ret = ret slash path[c]
|
||||
}
|
||||
print (ret ~ /^\// ? "" : "/") ret
|
||||
}'
|
||||
merge_paths() { # 5.2.3 -- merge_paths B_AUTHORITY B_PATH R_PATH
	# Merge a relative reference path into a base path and print the result
	# on stdout.
	#
	# $1 - base authority, as handed over by the caller
	# $2 - base path
	# $3 - reference path
	#
	# The three values are kept in locals so they no longer leak into the
	# caller's scope; isdefined/isempty still see them by name because bash
	# locals are visible to the functions called from here.
	# shellcheck disable=2034
	local B_authority="$1"
	local B_path="$2"
	local R_path="$3"

	# if R_path is empty, get rid of // in B_path
	# NOTE(review): the replacement text in the two substitutions below looks
	# like it expands back to "//" on recent bash, which would make them
	# no-ops -- confirm the intended behaviour before changing them.
	if [[ -z "$R_path" ]]; then
		printf '%s\n' "${B_path//\/\//\//}"
		return
	fi

	if isdefined "B_authority" && isempty "B_path"; then
		# empty base path: the reference path hangs directly off "/"
		printf '/%s\n' "${R_path//\/\//\//}"
	else
		# drop the last segment of the base path (everything after the
		# right-most "/"), or the whole path when it has no "/"
		if [[ "$B_path" == */* ]]; then
			B_path="${B_path%/*}/"
		else
			B_path=""
		fi
		printf '%s/%s\n' "${B_path%/}" "${R_path#/}"
	fi
}
|
||||
|
||||
split_url() {
|
||||
gawk -vvar="$1" '{
|
||||
if (match($0, /^[A-Za-z]+:/)) {
|
||||
arr["scheme"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^\/\/[^\/?#]+?/) || (match($0, /^[^\/?#]+?/) && scheme)) {
|
||||
arr["authority"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^\/?[^?#]+/)) {
|
||||
arr["path"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^\?[^#]+/)) {
|
||||
arr["query"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^#.*/)) {
|
||||
arr["fragment"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
for (part in arr) {
|
||||
sub(/[[:space:]]+$/, "", arr[part])
|
||||
printf var "[\"%s\"]=\"%s\"\n", part, arr[part]
|
||||
}
|
||||
}'
|
||||
remove_dot_segments() { # 5.2.4
	# Print PATH ($1) with its "." and ".." segments resolved.
	#
	# The path is consumed from the front of `input` one segment at a time
	# and the surviving segments are appended to `output`.
	local input="$1"
	local output=
	while [[ "$input" ]]; do
		if [[ "$input" =~ ^\.\.?/ ]]; then
			# leading "./" or "../": drop it
			input="${input#"${BASH_REMATCH[0]}"}"
		elif [[ "$input" =~ ^/\.(/|$) ]]; then
			# "/./" or a final "/.": collapse to "/"
			input="/${input#"${BASH_REMATCH[0]}"}"
		elif [[ "$input" =~ ^/\.\.(/|$) ]]; then
			# "/../" or a final "/..": collapse to "/" and drop the
			# last segment already written to the output, if any
			input="/${input#"${BASH_REMATCH[0]}"}"
			if [[ "$output" =~ /?[^/]+$ ]]; then
				# quoted so the segment is removed literally, even
				# when it contains glob characters such as [ ] * ?
				output="${output%"${BASH_REMATCH[0]}"}"
			fi
		elif [[ "$input" == . || "$input" == .. ]]; then
			input=
		else
			# move the first segment (with its leading "/", if any)
			# from the input to the output
			[[ $input =~ ^(/?[^/]*)(/?.*)$ ]] || echo NOMATCH >&2
			output="$output${BASH_REMATCH[1]}"
			input="${BASH_REMATCH[2]}"
		fi
	done
	# NOTE(review): this substitution's replacement text looks like it expands
	# back to "//" on recent bash, making it a no-op -- confirm the intent.
	printf '%s\n' "${output//\/\//\//}"
}
|
||||
|
||||
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
	# Split STRING into URL components and print one shell assignment per
	# component ("NAME[component]=value", value quoted with %q), ready to be
	# eval'd by a caller that has declared NAME as an associative array.
	#
	# $1 - name of the associative array to fill
	# $2 - the URL (or relative reference) to parse
	#
	# scheme, authority, query and fragment are printed only when their
	# delimiter (":", "//", "?", "#") is present in STRING, so a component
	# that is present but empty (e.g. the authority in "file:///x") is
	# printed with an empty value instead of being left out.  path is
	# always printed.
	local name="$1"
	local string="$2"
	# groups 1, 3, 6 and 8 include the delimiter; 2, 4, 5, 7 and 9 are the
	# bare scheme, authority, path, query and fragment
	local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
	[[ $string =~ $re ]] || return $?

	if [[ -n "${BASH_REMATCH[1]}" ]]; then
		printf '%s[scheme]=%q\n' "$name" "${BASH_REMATCH[2]}"
	fi
	if [[ -n "${BASH_REMATCH[3]}" ]]; then
		printf '%s[authority]=%q\n' "$name" "${BASH_REMATCH[4]}"
	fi
	if [[ -n "${BASH_REMATCH[6]}" ]]; then
		printf '%s[query]=%q\n' "$name" "${BASH_REMATCH[7]}"
	fi
	if [[ -n "${BASH_REMATCH[8]}" ]]; then
		printf '%s[fragment]=%q\n' "$name" "${BASH_REMATCH[9]}"
	fi
	# the path is always set, even if empty
	printf '%s[path]=%q\n' "$name" "${BASH_REMATCH[5]}"
}
|
||||
|
||||
# is a NAME defined ('set' in bash)?
|
||||
isdefined() { # isdefined NAME
	# Succeeds when NAME (a variable or an ARRAY[key] element) is set,
	# whether or not its value is empty: the indirect "+x" expansion yields
	# "x" only for a set name.
	[[ -n "${!1+x}" ]]
}
|
||||
# is a NAME defined AND empty?
|
||||
isempty() { # isempty NAME
	# Succeeds only when NAME (a variable or an ARRAY[key] element) is set
	# AND empty: an unset name expands to the "x" fallback and so fails.
	[[ -z "${!1-x}" ]]
}
|
||||
|
||||
request_url() {
|
||||
local server="$1"
|
||||
local port="$2"
|
||||
|
|
Loading…
Reference in New Issue