huntnw/crawly

#!/bin/bash
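# crawly: a tiny web crawler in plain bash.
# Usage: ./crawly [seed-url]   (with no argument it resumes from an existing urls.txt)
# Each page is fetched with curl, the links it contains are appended to urls.txt,
# and a cleaned text snapshot is stored under ./content/ (the directory must
# already exist). Relies on curl, shasum, and GNU tools (gawk, uniq -w).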
# cleanxss: HTML-escape &, <, >, " and ', flatten newlines to spaces, and end with a newline
cleanxss(){ sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g; s/"/\&quot;/g; s/'"'"'/\&#39;/g' | tr "\n" " " | sed -e '$a\'; }

# scratch file reused for each fetched page
TEMPIDC=$(mktemp)

# visit <url> <list-file>: fetch <url>, append the links found on the page to <list-file>,
# and write a cleaned text snapshot to ./content/
function visit(){
regex='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]'
string=$1
if [[ $string =~ $regex && $string != *"<"* && $string != *">"* ]]
then
echo "visiting $1"
sleep 1
# get URLs
CRAWLED="./content/"$(echo "$1" | shasum | head -c 5)
echo "$CRAWLED"
curl "$1" -m 1 -L -A "tser/1.0" > "$TEMPIDC"

# keep links that already look like pages (html/php/txt or a trailing slash); add a slash to the rest
grep 'href="' "$TEMPIDC" | grep -o 'https://[^"]*' | awk '/html$/ || /php$/ || /txt$/ || /\/$/ { print $0 }' >> "$2"
grep 'href="' "$TEMPIDC" | grep -o 'https://[^"]*' | awk '!/html$/ && !/php$/ && !/txt$/ && !/\/$/ { print $0"/" }' >> "$2"

# save the <title>, the URL itself, and every <p> block as the snapshot (RS/RT record splitting needs gawk)
awk -v IGNORECASE=1 -v RS='</title' 'RT{gsub(/.*<title[^>]*>/,"");print;exit}' "$TEMPIDC" | cleanxss > "$CRAWLED"
echo "$1" | cleanxss >> "$CRAWLED"
awk -v IGNORECASE=1 -v RS='</p' 'RT{gsub(/.*<p[^>]*>/,"");print}' "$TEMPIDC" | cleanxss >> "$CRAWLED"
else
echo "link $1 is invalid"
fi
}
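
# seed the crawl with the URL given on the command line, if any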
if [ -n "$1" ]; then
SITE=$1
visit "$SITE" urls.txt
fi
# gather every URL discovered so far, skipping malformed entries, then crawl each one
URLLIST=$(grep -v '<' urls.txt | grep -v ' ' | sort | uniq)
rm urls.txt
for fn in $URLLIST; do
echo "found URL $fn"
visit "$fn" urls.txt
done
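
# deduplicate the collected URLs, comparing only their first 20 characters (uniq -w is a GNU extension)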
mv urls.txt urls.txt.bak
sort urls.txt.bak | uniq -w 20 > urls.txt
rm urls.txt.bak
# prune snapshots that are too small (under 100 bytes) or too large (over 15 KiB)
find ./content/ -type 'f' -size -100c -delete
find ./content/ -type 'f' -size +15k -delete
# drop snapshots of error pages (-r keeps xargs from running rm when grep finds nothing)
grep -lrIiZ "404 not found" ./content/ | xargs -0 -r rm --
grep -lrIiZ "403 forbidden" ./content/ | xargs -0 -r rm --