forked from acdw/bollux
Bring URL transforming up to spec
URL transformation (formerly called "munging") is now, as far as I can tell, fully compliant with the URI specification (RFC 3986, section 5.2, "Relative Resolution"). It is also implemented in pure Bash and is available as a standalone library at https://git.sr.ht/~acdw/shurlie.
This commit is contained in:
parent
482e2659ae
commit
c62e428c16
202
bollux
202
bollux
|
@ -122,7 +122,7 @@ blastoff() { # load a url
|
|||
URL="$1"
|
||||
|
||||
if $well_formed && [[ "$1" != "$BOLLUX_URL" ]]; then
|
||||
URL="$(run munge_url "$1" "$BOLLUX_URL")"
|
||||
URL="$(run transform_resource "$BOLLUX_URL" "$1")"
|
||||
fi
|
||||
[[ "$URL" != *://* ]] && URL="$BOLLUX_PROTO://$URL"
|
||||
URL="$(trim <<<"$URL")"
|
||||
|
@ -134,95 +134,133 @@ blastoff() { # load a url
|
|||
run handle_response "$URL"
|
||||
}
|
||||
|
||||
munge_url() {
|
||||
local -A new old u
|
||||
eval "$(split_url new <<<"$1")"
|
||||
for k in "${!new[@]}"; do log d "new[$k]=${new[$k]}"; done
|
||||
eval "$(split_url old <<<"$2")"
|
||||
for k in "${!old[@]}"; do log d "old[$k]=${old[$k]}"; done
|
||||
|
||||
u['scheme']="${new['scheme']:-${old['scheme']:-}}"
|
||||
u['authority']="${new['authority']:-${old['authority']:-}}"
|
||||
# XXX this whole path thing is wack
|
||||
if [[ "${new['path']+isset}" ]]; then
|
||||
log d 'new path set'
|
||||
if [[ "${new['path']}" == /* ]]; then
|
||||
log d 'new path == /*'
|
||||
u['path']="${new['path']}"
|
||||
elif [[ "${new['authority']}" == "${old['authority']}" || ! "${new['authority']+isset}" ]]; then
|
||||
p="${old['path']:-}/${new['path']}"
|
||||
log d "$p ( $(normalize_path <<<"$p") )"
|
||||
u['path']="$(normalize_path <<<"$p")"
|
||||
else
|
||||
log d 'u path = new path'
|
||||
u['path']="${new['path']}"
|
||||
fi
|
||||
elif [[ "${new['query']+isset}" || "${new['fragment']+isset}" ]]; then
|
||||
log d 'u path = old path'
|
||||
u['path']="${old['path']}"
|
||||
else
|
||||
u['path']="/"
|
||||
transform_resource() { # transform_resource BASE_URL REFERENCE_URL
	# Resolve REFERENCE_URL against BASE_URL and print the resulting target
	# URL on stdout, followed by a newline.
	#
	# Both URLs are split with parse_url into the associative arrays B and
	# R; the target components are collected in T and recomposed at the
	# end.  A component is only copied into T when it is *defined* in its
	# source, so that an absent query/authority/scheme never turns into an
	# empty-but-present one (which recomposition would render as a stray
	# "?", "//" or ":").
	#
	# Environment:
	#   STRICT  run as a command; when it fails (e.g. STRICT=false) a
	#           reference scheme equal to the base scheme is ignored.
	#           Defaults to "true".
	declare -A R B T # reference, base url, target
	eval "$(parse_url B "$1")"
	eval "$(parse_url R "$2")"

	# A non-strict parser may ignore a scheme in the reference
	# if it is identical to the base URI's scheme.
	if ! "${STRICT:-true}" && [[ "${R[scheme]}" == "${B[scheme]}" ]]; then
		# Unset the array element itself (quoted so it is not globbed).
		unset 'R[scheme]'
	fi

	# basically pseudo-code from spec ported to bash
	if isdefined "R[scheme]"; then
		T[scheme]="${R[scheme]}"
		isdefined "R[authority]" && T[authority]="${R[authority]}"
		isdefined "R[path]" &&
			T[path]="$(remove_dot_segments "${R[path]}")"
		isdefined "R[query]" && T[query]="${R[query]}"
	elif isdefined "R[authority]"; then
		isdefined "B[scheme]" && T[scheme]="${B[scheme]}"
		T[authority]="${R[authority]}"
		isdefined "R[path]" &&
			T[path]="$(remove_dot_segments "${R[path]}")"
		isdefined "R[query]" && T[query]="${R[query]}"
	else
		isdefined "B[scheme]" && T[scheme]="${B[scheme]}"
		isdefined "B[authority]" && T[authority]="${B[authority]}"
		if isempty "R[path]"; then
			# Same document: keep the base path, and the base query
			# unless the reference brings its own.
			T[path]="${B[path]}"
			if isdefined "R[query]"; then
				T[query]="${R[query]}"
			elif isdefined "B[query]"; then
				T[query]="${B[query]}"
			fi
		else
			if [[ "${R[path]}" == /* ]]; then
				T[path]="$(remove_dot_segments "${R[path]}")"
			else
				# merge_paths receives the NAME of the base
				# authority, not its value.
				T[path]="$(merge_paths "B[authority]" "${B[path]}" "${R[path]}")"
				T[path]="$(remove_dot_segments "${T[path]}")"
			fi
			isdefined "R[query]" && T[query]="${R[query]}"
		fi
	fi
	isdefined "R[fragment]" && T[fragment]="${R[fragment]}"

	# cf. 5.3 -- recomposition
	local r=""
	isdefined "T[scheme]" && r="$r${T[scheme]}:"
	isdefined "T[authority]" && r="$r//${T[authority]}"
	r="$r${T[path]}"
	isdefined "T[query]" && r="$r?${T[query]}"
	isdefined "T[fragment]" && r="$r#${T[fragment]}"
	printf '%s\n' "$r"
}
|
||||
|
||||
normalize_path() {
|
||||
gawk '{
|
||||
split($0, path, /\//)
|
||||
for (c in path) {
|
||||
if (path[c] == "" || path[c] == ".") {
|
||||
continue
|
||||
}
|
||||
if (path[c] == "..") {
|
||||
sub(/[^\/]+$/, "", ret)
|
||||
continue
|
||||
}
|
||||
if (! ret || match(ret, /\/$/)) {
|
||||
slash = ""
|
||||
} else {
|
||||
slash = "/"
|
||||
}
|
||||
ret = ret slash path[c]
|
||||
}
|
||||
print (ret ~ /^\// ? "" : "/") ret
|
||||
}'
|
||||
merge_paths() { # 5.2.3
	# merge_paths BASE_AUTHORITY BASE_PATH REFERENCE_PATH
	#
	# Merge a relative REFERENCE_PATH onto BASE_PATH and print the result
	# on stdout, followed by a newline.
	#
	# BASE_AUTHORITY is normally the NAME of the variable holding the base
	# authority (the caller in this file passes "B[authority]"); the base
	# is considered to have an authority when that variable is set.  Any
	# argument that is not a syntactically valid variable reference is
	# treated as a literal authority value and counts as "defined".
	local authority_ref="$1"
	local base_path="$2"
	local ref_path="$3"
	local name_re='^[A-Za-z_][A-Za-z0-9_]*(\[[^]]+\])?$'
	local has_authority=true

	# Only dereference when the argument can be a variable name; an
	# indirect expansion of an invalid name would be a hard error.
	if [[ "$authority_ref" =~ $name_re ]]; then
		if [[ -z "${!authority_ref+x}" ]]; then
			has_authority=false
		fi
	fi

	# if R_path is empty, get rid of // in B_path
	# NOTE(review): the replacement part of this expansion looks like it
	# carries one "/" too many, so it may not actually squeeze "//" --
	# confirm before relying on it.  Left unchanged on purpose.
	if [[ -z "$ref_path" ]]; then
		printf '%s\n' "${base_path//\/\//\//}"
		return
	fi

	if "$has_authority" && [[ -z "$base_path" ]]; then
		# Base has an authority but no path: the result is rooted.
		printf '/%s\n' "${ref_path//\/\//\//}"
	elif [[ "$base_path" == */* ]]; then
		# Replace everything after the right-most "/" of the base path.
		printf '%s/%s\n' "${base_path%/*}" "${ref_path#/}"
	else
		# Base path has no "/" at all: it is dropped entirely and the
		# reference path is used as-is (no leading "/" is invented).
		printf '%s\n' "$ref_path"
	fi
}
|
||||
|
||||
split_url() {
|
||||
gawk -vvar="$1" '{
|
||||
if (match($0, /^[A-Za-z]+:/)) {
|
||||
arr["scheme"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^\/\/[^\/?#]+?/) || (match($0, /^[^\/?#]+?/) && scheme)) {
|
||||
arr["authority"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^\/?[^?#]+/)) {
|
||||
arr["path"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^\?[^#]+/)) {
|
||||
arr["query"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
if (match($0, /^#.*/)) {
|
||||
arr["fragment"] = substr($0, RSTART, RLENGTH)
|
||||
$0 = substr($0, RLENGTH + 1)
|
||||
}
|
||||
for (part in arr) {
|
||||
sub(/[[:space:]]+$/, "", arr[part])
|
||||
printf var "[\"%s\"]=\"%s\"\n", part, arr[part]
|
||||
}
|
||||
}'
|
||||
remove_dot_segments() { # 5.2.4
	# remove_dot_segments PATH
	#
	# Resolve "." and ".." segments in PATH and print the result on stdout,
	# followed by a newline.  The path is consumed from the front of
	# $input and the resolved part is accumulated in $output.
	local input="$1"
	local output=
	# Last segment of $output together with its preceding "/" (if any).
	# The segment itself may be empty, hence "*" rather than "+".
	local last_segment_re='/?[^/]*$'

	while [[ "$input" ]]; do
		if [[ "$input" =~ ^\.\.?/ ]]; then
			# Leading "./" or "../": drop it.
			input="${input#"${BASH_REMATCH[0]}"}"
		elif [[ "$input" =~ ^/\.(/|$) ]]; then
			# Leading "/./" or a final "/.": replace with "/".
			input="/${input#"${BASH_REMATCH[0]}"}"
		elif [[ "$input" =~ ^/\.\.(/|$) ]]; then
			# Leading "/../" or a final "/..": replace with "/" and
			# drop the last segment already written to the output.
			input="/${input#"${BASH_REMATCH[0]}"}"
			if [[ "$output" =~ $last_segment_re ]]; then
				# Quoted so the segment is stripped literally even
				# when it contains glob characters such as "[".
				output="${output%"${BASH_REMATCH[0]}"}"
			fi
		elif [[ "$input" == . || "$input" == .. ]]; then
			input=
		else
			# Move the first segment (with its leading "/", if any)
			# from the input to the output.
			[[ $input =~ ^(/?[^/]*)(/?.*)$ ]] || echo NOMATCH >&2
			output="$output${BASH_REMATCH[1]}"
			input="${BASH_REMATCH[2]}"
		fi
	done
	# NOTE(review): apparently meant to squeeze "//" in the result, but the
	# replacement part looks like it carries one "/" too many -- confirm
	# what it really does.  Left unchanged on purpose.
	printf '%s\n' "${output//\/\//\//}"
}
|
||||
|
||||
parse_url() { # eval "$(parse_url NAME STRING)" => NAME[...]
	# Split STRING into URL components and print one shell assignment per
	# component ("NAME[component]=value", value quoted with %q) for the
	# caller to eval into an associative array called NAME.
	#
	# scheme, authority, query and fragment are only printed when they are
	# present in STRING; a component that is present but empty (e.g. the
	# authority in "file:///x", or the query in "x?") is printed with an
	# empty value so callers can tell "empty" from "absent".  path is
	# always printed, even when empty.
	#
	# Returns non-zero without printing anything if STRING does not match.
	local name="$1"
	local string="$2"
	local re='^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
	[[ $string =~ $re ]] || return $?

	# Groups 1, 3, 6 and 8 include the component's delimiter, so they are
	# non-empty exactly when the component is present; groups 2, 4, 7 and
	# 9 hold the component's value.
	if [[ -n "${BASH_REMATCH[1]}" ]]; then
		printf '%s[%s]=%q\n' "$name" scheme "${BASH_REMATCH[2]}"
	fi
	if [[ -n "${BASH_REMATCH[3]}" ]]; then
		printf '%s[%s]=%q\n' "$name" authority "${BASH_REMATCH[4]}"
	fi
	if [[ -n "${BASH_REMATCH[6]}" ]]; then
		printf '%s[%s]=%q\n' "$name" query "${BASH_REMATCH[7]}"
	fi
	if [[ -n "${BASH_REMATCH[8]}" ]]; then
		printf '%s[%s]=%q\n' "$name" fragment "${BASH_REMATCH[9]}"
	fi
	# unclear if the path is always set even if empty but it looks that way
	printf '%s[path]=%q\n' "$name" "${BASH_REMATCH[5]}"
}
|
||||
|
||||
# is a NAME defined ('set' in bash)?
|
||||
isdefined() { # isdefined NAME
	# Succeeds when the variable (or array element) called NAME is set,
	# even if its value is empty.
	[[ -n "${!1+x}" ]]
}
|
||||
# is a NAME defined AND empty?
|
||||
isempty() { # isempty NAME
	# Succeeds only when the variable (or array element) called NAME is
	# set and holds the empty string; an unset NAME expands to "x" here
	# and therefore fails.
	[[ -z "${!1-x}" ]]
}
|
||||
|
||||
request_url() {
|
||||
local server="$1"
|
||||
local port="$2"
|
||||
|
|
Loading…
Reference in New Issue