#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "sfeed_*" executables are in $PATH.
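
# usage, per sfeed_update(1): sfeed_update [sfeedrcfile]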

# defaults
sfeedpath="$HOME/.sfeed/feeds"

# used for processing feeds concurrently: wait until ${maxjobs} feeds
# have finished before starting the next batch.
maxjobs=8

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow specifying the config via argv[1].
	if [ "$1" != "" ]; then
		# get the absolute path of the config file, required for including it.
		config="$1"
		path=$(readlink -f "${config}" 2>/dev/null)
	else
		# default config location.
		config="$HOME/.config/sfeed/sfeedrc"
		path="${config}"
	fi

	# the config is loaded here so it can override $sfeedpath or the functions below.
	if [ -r "${path}" ]; then
		. "${path}"
	else
		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
		echo "See sfeedrc.example for an example." >&2
		exit 1
	fi
}
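
# Since the config is plain shellscript, it can override the defaults
# above; a minimal sketch (the values are placeholders, not defaults of
# this script):
#
#	sfeedpath="$HOME/feeds"
#	maxjobs=16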

# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
}

# log_error(name, s)
log_error() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	# set error exit status indicator for parallel jobs.
	rm -f "${sfeedtmpdir}/ok"
}

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, url, feedfile)
fetch() {
	# fail on redirects, send no User-Agent header, time out after 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		"$2" 2>/dev/null
}
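
# fetch() can be overridden in the sfeedrc to use another downloader; a
# sketch using wget (an assumption: wget must be installed; unlike the
# curl line above it will not hide a User-Agent):
#
#	fetch() {
#		wget --max-redirect=0 -q -T 15 -O - "$2" 2>/dev/null
#	}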

# convert encoding from one encoding to another.
# convertencoding(name, from, to)
convertencoding() {
	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
		iconv -cs -f "$2" -t "$3" 2> /dev/null
	else
		# no conversion needed: pass the input through unchanged.
		cat
	fi
}

# parse and convert input, by default XML to the sfeed(5) TSV format.
# parse(name, feedurl, basesiteurl)
parse() {
	sfeed "$3"
}

# filter fields.
# filter(name)
filter() {
	cat
}
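
# filter() is a hook meant to be overridden in the sfeedrc to
# post-process the TSV lines per feed; a sketch that drops items whose
# title (field 2) matches a placeholder pattern:
#
#	filter() {
#		awk -F '\t' '$2 !~ /Sponsored/'
#	}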

# merge raw files: unique sort by id, title, link.
# merge(name, oldfile, newfile)
merge() {
	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
}
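
# the sort keys follow the sfeed(5) TSV layout: field 1 is the UNIX
# timestamp, field 2 the title, field 3 the link and field 6 the id;
# the -t separator argument is a literal TAB character.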

# order by timestamp (descending).
# order(name)
order() {
	sort -t '	' -k1rn,1
}
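
# each feed is processed as a pipeline of the hooks above:
# fetch -> convertencoding -> parse -> filter -> merge -> order,
# with a temporary file written between every stage.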

# internal handler to fetch and process a feed.
# _feed(name, feedurl, [basesiteurl], [encoding])
_feed() {
	name="$1"
	feedurl="$2"
	basesiteurl="$3"
	encoding="$4"

	filename="$(printf '%s' "${name}" | tr '/' '_')"
	sfeedfile="${sfeedpath}/${filename}"
	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

	# if the file does not exist yet, create it.
	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null

	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
		log_error "${name}" "FAIL (FETCH)"
		return 1
	fi

	# try to detect the encoding (if not specified). if detection fails
	# the data is passed through unconverted and assumed to be UTF-8.
	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")

	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
		log_error "${name}" "FAIL (ENCODING)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	# if basesiteurl is empty then use feedurl.
	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
		log_error "${name}" "FAIL (PARSE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.utf8"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log_error "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for the stages below.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log_error "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log_error "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the ordered result over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
		log_error "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	return 0
}

# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
feed() {
	# wait until a batch of ${maxjobs} jobs has finished: this can stall
	# the queue if one item is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	_feed "$@" &
}
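
# feed() is what an sfeedrc's feeds() calls once per feed; a sketch
# (the names and URLs are placeholders):
#
#	feeds() {
#		feed "example" "https://example.org/atom.xml"
#		feed "old charset" "https://example.org/rss.xml" "https://example.org" "iso-8859-1"
#	}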

cleanup() {
	# remove temporary directory with feed files.
	rm -rf "${sfeedtmpdir}"
}

sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}

# fallback: a valid config file overrides this with its own feeds().
feeds() {
	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
	echo "See sfeedrc.example for an example." >&2
}

main() {
	# job counter.
	curjobs=0
	# signal number received by the parent.
	signo=0
	# SIGINT: signal to interrupt the parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate the parent.
	trap -- "sighandler 15" "TERM"
	# load the config file.
	loadconfig "$1"
	# fetch feeds and store them in a temporary directory.
	sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')"
	mkdir -p "${sfeedtmpdir}/feeds"
	touch "${sfeedtmpdir}/ok"
	# make sure the feeds path exists.
	mkdir -p "${sfeedpath}"
	# fetch the feeds specified in the config file.
	feeds
	# wait until all feeds are fetched (concurrently).
	[ ${signo} -eq 0 ] && wait
	# check the error exit status indicator for the parallel jobs.
	[ -f "${sfeedtmpdir}/ok" ]
	status=$?
	# clean up temporary files etc.
	cleanup
	# on SIGINT or SIGTERM exit with the signal number + 128.
	[ ${signo} -ne 0 ] && exit $((signo+128))
	exit ${status}
}

# allow including this file as a library when SFEED_UPDATE_INCLUDE is set.
[ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"