#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "sfeed_*" executables are in $PATH.
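# usage: sfeed_update [sfeedrc]
# the optional argument is the path of the sfeedrc config file, see
# loadconfig() below.
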
# defaults
sfeedpath="$HOME/.sfeed/feeds"

# used for processing feeds concurrently: wait until a batch of ${maxjobs}
# feeds is finished before starting the next batch.
maxjobs=8

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow to specify config via argv[1].
	if [ "$1" != "" ]; then
		# get absolute path of config file required for including.
		config="$1"
		path=$(readlink -f "${config}" 2>/dev/null)
	else
		# default config location.
		config="$HOME/.config/sfeed/sfeedrc"
		path="${config}"
	fi

	# config is loaded here to be able to override $sfeedpath or functions.
	if [ -r "${path}" ]; then
		. "${path}"
	else
		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
		echo "See sfeedrc.example for an example." >&2
		exit 1
	fi
}

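# the sfeedrc is a plain shellscript: it must define a feeds() function and
# may override the variables and functions in this file. a minimal sketch
# (the feed name and URL are hypothetical examples):
#
#	#sfeedpath="$HOME/.sfeed/feeds"
#	#maxjobs=16
#	feeds() {
#		# feed <name> <feedurl> [basesiteurl] [encoding]
#		feed "example" "https://example.org/atom.xml"
#	}
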
# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
}

# log_error(name, s)
log_error() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	# set error exit status indicator for parallel jobs.
	rm -f "${sfeedtmpdir}/ok"
}

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, url, feedfile)
fetch() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		"$2" 2>/dev/null
}

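# fetch() can be overridden in the sfeedrc to use a different downloader.
# a sketch using wget instead of curl (assumes wget is installed; the flags
# mirror the curl options above where possible):
#
#	fetch() {
#		wget -q --timeout=15 --max-redirect=0 -O - "$2" 2>/dev/null
#	}
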
# convert the input from one encoding to another.
# convertencoding(name, from, to)
convertencoding() {
	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
		iconv -cs -f "$2" -t "$3" 2> /dev/null
	else
		# else no convert, just output.
		cat
	fi
}

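# for example, a feed declared with encoding "iso-8859-1" in the config is
# piped through: iconv -cs -f "iso-8859-1" -t "utf-8"
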
# parse and convert input, by default XML to the sfeed(5) TSV format.
# parse(name, feedurl, basesiteurl)
parse() {
	sfeed "$3"
}

# filter fields.
# filter(name)
filter() {
	cat
}

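# filter() can be overridden in the sfeedrc to modify or drop items. a
# sketch that drops items whose title (TSV field 2) matches a word; the
# pattern "advertisement" is a hypothetical example:
#
#	filter() {
#		awk -F '\t' 'tolower($2) !~ /advertisement/'
#	}
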
# merge raw files: unique sort by id, title, link.
# merge(name, oldfile, newfile)
merge() {
	# NOTE: the field separator passed to -t is a literal TAB character.
	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
}

# order by timestamp (descending).
# order(name)
order() {
	# NOTE: the field separator passed to -t is a literal TAB character.
	sort -t '	' -k1rn,1
}

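# field layout of the sfeed(5) TSV format used above: 1=UNIX timestamp,
# 2=title, 3=link, 4=content, 5=content-type, 6=id, 7=author, 8=enclosure,
# 9=category.
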
# internal handler to fetch and process a feed.
# _feed(name, feedurl, [basesiteurl], [encoding])
_feed() {
	name="$1"
	feedurl="$2"
	basesiteurl="$3"
	encoding="$4"

	filename="$(printf '%s' "${name}" | tr '/' '_')"
	sfeedfile="${sfeedpath}/${filename}"
	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

	# if the file does not exist yet, create it.
	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null

	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
		log_error "${name}" "FAIL (FETCH)"
		return 1
	fi

	# try to detect the encoding (if not specified). if detecting the
	# encoding fails, assume utf-8.
	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")

	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
		log_error "${name}" "FAIL (ENCODING)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	# if basesiteurl is empty then use feedurl.
	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
		log_error "${name}" "FAIL (PARSE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.utf8"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log_error "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for the stages below.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log_error "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log_error "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the ordered result over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
		log_error "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	return 0
}

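# each feed thus flows through the pipeline:
# fetch -> convertencoding -> parse -> filter -> merge (with the old data) ->
# order -> copy.
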
# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
feed() {
	# wait until ${maxjobs} are finished: will stall the queue if an item
	# is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	_feed "$@" &
}

cleanup() {
	# remove temporary directory with feed files.
	rm -rf "${sfeedtmpdir}"
}

sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}

# fallback: a valid config file is expected to override this function.
feeds() {
	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
	echo "See sfeedrc.example for an example." >&2
}

main() {
	# job counter.
	curjobs=0
	# signal number received for parent.
	signo=0
	# SIGINT: signal to interrupt parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate parent.
	trap -- "sighandler 15" "TERM"

	# load config file.
	loadconfig "$1"
	# fetch feeds and store in temporary directory.
	sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')" || exit 1
	mkdir -p "${sfeedtmpdir}/feeds"
	touch "${sfeedtmpdir}/ok" || exit 1
	# make sure path exists.
	mkdir -p "${sfeedpath}"
	# fetch feeds specified in config file.
	feeds
	# wait till all feeds are fetched (concurrently).
	[ ${signo} -eq 0 ] && wait
	# check error exit status indicator for parallel jobs.
	[ -f "${sfeedtmpdir}/ok" ]
	status=$?
	# cleanup temporary files etc.
	cleanup
	# on signal SIGINT and SIGTERM exit with signal number + 128.
	[ ${signo} -ne 0 ] && exit $((signo+128))
	exit ${status}
}

# do not run main when this file is being included from another script.
[ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"
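
# the SFEED_UPDATE_INCLUDE guard above allows sourcing this file from another
# script to reuse its functions. a minimal sketch of such a wrapper (assumes
# sfeed_update can be found via $PATH by the "." builtin):
#
#	#!/bin/sh
#	SFEED_UPDATE_INCLUDE="1" . sfeed_update
#	loadconfig "$1"
#	echo "feeds are stored in ${sfeedpath}"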