sfeed_update: add filter(), order() support per feed + improvements

Pass the name parameter to the functions and add these to the pipeline. They
can be overridden in the config.

- add the ability to change the merge logic per feed.
- add the ability to filter lines and fields per feed.
- add the ability to order lines differently per feed.
- add filter example to README.

- code-style:
  - fetchfeed consistency in parameter order.
  - change [ x"" = x"" ] to [ "" = "" ]. Simplify some if statements.
  - wrap long line in fetchfeed().
  - use signal names for trap.
Hiltjo Posthuma 2018-09-28 17:11:56 +02:00
parent 5aa78eb161
commit cc9f0d5549
3 changed files with 117 additions and 45 deletions
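
Each hook now receives the feed name as its first argument, so a config can
dispatch on it. A minimal sketch of a per-feed override in an sfeedrc (the
feed name "example" and its URL are hypothetical):

	# order(name): keep the default reverse-time order for all feeds,
	# but sort one feed by title instead.
	order() {
		case "$1" in
		"example")
			sort -t '	' -k2,2;;
		*)
			sort -t '	' -k1rn,1;;
		esac
	}

	feeds() {
		feed "example" "https://example.org/atom.xml"
	}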

README

@@ -127,12 +127,18 @@ Files read at runtime by sfeed_update(1)
 ----------------------------------------
 sfeedrc - Config file. This file is evaluated as a shellscript in
-          sfeed_update(1). You can for example override the fetchfeed()
-          function to use wget(1), OpenBSD ftp(1) or an other download program or
-          you can override the merge() function to change the merge logic. The
-          function feeds() is called to fetch the feeds. The function feed()
-          can safely be executed concurrently as a background job in your
-          sfeedrc(5) config file to make updating faster.
+          sfeed_update(1).
+
+          At least the following functions can be overridden per feed:
+          - fetchfeed: to use wget(1), OpenBSD ftp(1) or another download program.
+          - merge: to change the merge logic.
+          - filter: to filter on fields.
+          - order: to change the sort order.
+
+          The function feeds() is called to fetch the feeds. The function feed() can
+          safely be executed concurrently as a background job in your sfeedrc(5)
+          config file to make updating faster.
 
 Files written at runtime by sfeed_update(1)
@@ -212,6 +218,48 @@ argument is optional):
 
 - - -
 
+# filter fields.
+# filter(name)
+filter() {
+	case "$1" in
+	"tweakers")
+		LC_ALL=C awk -F '	' 'BEGIN {
+			OFS = "	";
+		}
+		# skip ads.
+		$2 ~ /^ADV:/ {
+			next;
+		}
+		# shorten link.
+		{
+			if (match($3, /^https:\/\/tweakers\.net\/(nieuws|downloads|reviews|geek)\/[0-9]+\//)) {
+				$3 = substr($3, RSTART, RLENGTH);
+			}
+			print $0;
+		}';;
+	"yt BSDNow")
+		# filter only BSD Now from channel.
+		LC_ALL=C awk -F '	' '$2 ~ / \| BSD Now/';;
+	*)
+		cat;;
+	esac | \
+		# replace youtube links with embed links.
+		sed 's@www.youtube.com/watch?v=@www.youtube.com/embed/@g' | \
+		# try to strip utm_ tracking parameters.
+		LC_ALL=C awk -F '	' 'BEGIN {
+			OFS = "	";
+		}
+		{
+			gsub(/\?utm_([^&]+)/, "?", $3);
+			gsub(/&utm_([^&]+)/, "", $3);
+			gsub(/\?&/, "?", $3);
+			gsub(/[\?&]+$/, "", $3);
+			print $0;
+		}'
+}
+
+- - -
+
 Over time your feeds file might become quite big. You can archive items from a
 specific date by doing for example:
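
A quick way to sanity-check such a filter outside of sfeed_update(1) is to
source the config and pipe a hand-written line through it (a hypothetical
test; sfeed(5) fields are TAB-separated, with the timestamp, title and link
in fields 1-3):

	. ./sfeedrc
	printf '1538146315\tADV: some ad\thttps://tweakers.net/nieuws/123/\n' | \
		filter "tweakers"
	# prints nothing: the "$2 ~ /^ADV:/" rule drops the item.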

sfeed_update

@@ -9,7 +9,7 @@ sfeedpath="$HOME/.sfeed/feeds"
 # loadconfig(configfile)
 loadconfig() {
 	# allow to specify config via argv[1].
-	if [ ! x"$1" = x"" ]; then
+	if [ "$1" != "" ]; then
 		# get absolute path of config file.
 		config=$(readlink -f "$1")
 	else
@@ -17,8 +17,7 @@ loadconfig() {
 		config="$HOME/.sfeed/sfeedrc"
 	fi
 
-	# load config: config is loaded here to be able to override $sfeedpath
-	# or functions.
+	# config is loaded here to be able to override $sfeedpath or functions.
 	if [ -r "${config}" ]; then
 		. "${config}"
 	else
@@ -28,30 +27,11 @@ loadconfig() {
 	fi
 }
 
-# merge raw files.
-# merge(oldfile, newfile)
-merge() {
-	# unique sort by id, title, link.
-	# order by timestamp (desc).
-	(sort -t '	' -u -k6,6 -k2,2 -k3,3 "$1" "$2" 2>/dev/null) |
-		sort -t '	' -k1rn,1
-}
-
-# fetch a feed via HTTP/HTTPS etc.
-# fetchfeed(url, name, feedfile)
-fetchfeed() {
-	if curl -L --max-redirs 0 -H 'User-Agent:' -f -s -S -m 15 -z "$3" "$1" 2>/dev/null; then
-		printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
-	else
-		printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
-	fi
-}
-
 # convert encoding from one encoding to another.
 # convertencoding(from, to)
 convertencoding() {
 	# if from != to
-	if [ ! "$1" = "" ] && [ ! "$2" = "" ] && [ ! "$1" = "$2" ]; then
+	if [ "$1" != "" ] && [ "$2" != "" ] && [ "$1" != "$2" ]; then
 		iconv -cs -f "$1" -t "$2" 2> /dev/null
 	else
 		# else no convert, just output
@@ -59,6 +39,35 @@ convertencoding() {
 	fi
 }
 
+# merge raw files: unique sort by id, title, link.
+# merge(name, oldfile, newfile)
+merge() {
+	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
+}
+
+# filter fields.
+# filter(name)
+filter() {
+	cat
+}
+
+# order by timestamp (descending).
+# order(name)
+order() {
+	sort -t '	' -k1rn,1
+}
+
+# fetch a feed via HTTP/HTTPS etc.
+# fetchfeed(name, url, feedfile)
+fetchfeed() {
+	if curl -L --max-redirs 0 -H "User-Agent:" -f -s -S -m 15 \
+		-z "$3" "$2" 2>/dev/null; then
+		printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
+	else
+		printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
+	fi
+}
+
 # fetch and parse feed.
 # feed(name, feedurl, [basesiteurl], [encoding])
 feed() {
@@ -72,14 +81,14 @@ feed() {
 	sfeedfile="${sfeedpath}/${filename}"
 
 	if [ ! "${encoding}" = "" ]; then
-		fetchfeed "${feedurl}" "${name}" "${sfeedfile}" | \
+		fetchfeed "${name}" "${feedurl}" "${sfeedfile}" | \
 			convertencoding "${encoding}" "utf-8"
 	else # detect encoding.
 		tmpencfile="${tmpfeedfile}.enc"
-		fetchfeed "${feedurl}" "${name}" "${sfeedfile}" > "${tmpencfile}"
+		fetchfeed "${name}" "${feedurl}" "${sfeedfile}" > "${tmpencfile}"
 		detectenc=$(sfeed_xmlenc < "${tmpencfile}")
 		convertencoding "${detectenc}" "utf-8" < "${tmpencfile}"
-	fi | sfeed "${basesiteurl}" > "${tmpfeedfile}"
+	fi | sfeed "${basesiteurl}" | filter "${name}" > "${tmpfeedfile}"
 
 	# get new data and merge with old.
 	sfeedfilenew="${sfeedpath}/${filename}.new"
@@ -87,18 +96,20 @@ feed() {
 
 	if [ -s "${tmpfeedfile}" ]; then
 		# if file exists, merge
 		if [ -e "${sfeedfile}" ]; then
-			merge "${sfeedfile}" "${tmpfeedfile}" > "${sfeedfilenew}"
+			merge "${name}" "${sfeedfile}" "${tmpfeedfile}" | \
+				order "${name}" > "${sfeedfilenew}"
 
 			# overwrite old file with updated file
 			mv "${sfeedfilenew}" "${sfeedfile}"
 		else
-			merge "/dev/null" "${tmpfeedfile}" > "${sfeedfile}"
+			merge "${name}" "/dev/null" "${tmpfeedfile}" | \
+				order "${name}" > "${sfeedfile}"
 		fi
 	fi) &
 }
 
 cleanup() {
-	# remove temporary files
+	# remove temporary files.
 	rm -rf "${sfeedtmpdir}"
 }
@@ -114,9 +125,9 @@ feeds() {
 	# kill whole current process group on ^C (SIGINT).
 	isinterrupted="0"
 	# SIGTERM: signal to terminate parent.
-	trap -- "interrupted" "15"
+	trap -- "interrupted" "TERM"
 	# SIGINT: kill all running childs >:D
-	trap -- "kill -TERM -$$" "2"
+	trap -- "kill -TERM -$$" "INT"
 	# load config file.
 	loadconfig "$1"
 	# fetch feeds and store in temporary file.
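
Since ordering moved out of merge() into order(), a per-feed merge() override
only has to produce the merged set of items. As a sketch, an sfeedrc could
discard the old items entirely for one feed (the feed name "volatile" is
hypothetical; the default branch mirrors the new built-in merge()):

	# merge(name, oldfile, newfile)
	merge() {
		case "$1" in
		"volatile")
			# ignore the old file: keep only newly fetched items.
			sort -t '	' -u -k6,6 -k2,2 -k3,3 "$3" 2>/dev/null;;
		*)
			sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null;;
		esac
	}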

sfeed_update.1

@@ -1,4 +1,4 @@
-.Dd August 5, 2015
+.Dd September 28, 2018
 .Dt SFEED_UPDATE 1
 .Os
 .Sh NAME
@@ -29,15 +29,28 @@ section for more information.
 Config file, see the sfeedrc.example file for an example.
 This file is evaluated as a shellscript in
 .Nm .
-You can for example override the fetchfeed() function to
-use
-.Xr curl 1 ,
+.Pp
+At least the following functions can be overridden per feed:
+.Bl -tag -width 17n
+.It fetchfeed
+to use
 .Xr wget 1 ,
-or an other network downloader or you can override the merge() function to
-change the merge logic.
+OpenBSD
+.Xr ftp 1
+or another download program.
+.It merge
+to change the merge logic.
+.It filter
+to filter on fields.
+.It order
+to change the sort order.
+.El
+.Pp
 The function feeds() is called to fetch the feeds.
-By default the function feed() is executed concurrently as a background job to
-speedup updating.
+The function feed() can safely be executed concurrently as a background job in
+your
+.Xr sfeedrc 5
+config file to make updating faster.
 .El
 .Sh FILES WRITTEN
 .Bl -tag -width 17n
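
Both the README and the man page list fetchfeed() among the per-feed hooks;
with the new fetchfeed(name, url, feedfile) signature the URL is "$2". A
minimal wget(1) override might look like this (a sketch: it drops curl's -z
timestamp check and the OK/FAIL status lines):

	# fetchfeed(name, url, feedfile)
	fetchfeed() {
		wget -q -O - "$2" 2>/dev/null
	}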