huntnw/crawly

#!/bin/bash
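# crawly: a minimal shell web crawler.
# Starting from a seed URL it fetches pages, collects https:// links from
# href attributes into urls.txt, and saves each page's title, address and
# <p> text (HTML-escaped) under ./content/, keyed by a short hash of the URL.
# cleanxss: HTML-escape &, <, >, " and ', flatten newlines to spaces, end with a newline.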
cleanxss(){ sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g; s/"/\&quot;/g; s/'"'"'/\&#39;/g' | tr "\n" " " | sed -e '$a\'; }
TEMPIDC=$(mktemp)   # scratch file for the page being fetched
mkdir -p ./content  # crawled page text is written here
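# visit URL ($1): if it looks like a valid http/https/ftp/file link, fetch it,
# append any https links found in the page to the URL list file ($2), and write
# the page's title, address and paragraph text to a file under ./content/.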
visit() {
regex='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]'
string=$1
if [[ $string =~ $regex && $string != *"<"* && $string != *">"* ]]
then
echo "visiting $1"
sleep 1
# get URLs
CRAWLED="./content/$(echo "$1" | shasum | head -c 5)"
echo "$CRAWLED"
curl -m 1 -L -A "tser/1.0" "$1" > "$TEMPIDC"
# links already ending in html/php/txt or "/" are kept as-is; anything else is treated as a directory and gets a trailing "/"
grep 'href="' "$TEMPIDC" | grep "https://" | grep -o 'https://[^"]*' | awk '/html$/ || /php$/ || /txt$/ || /\/$/ { print $0 }' >> "$2"
grep 'href="' "$TEMPIDC" | grep "https://" | grep -o 'https://[^"]*' | awk '!/html$/ && !/php$/ && !/txt$/ && !/\/$/ { print $0"/" }' >> "$2"
# extract the <title>, the URL itself, then every <p> block (RT and IGNORECASE need gawk)
awk -v IGNORECASE=1 -v RS='</title' 'RT{gsub(/.*<title[^>]*>/,"");print;exit}' "$TEMPIDC" | cleanxss > "$CRAWLED"
echo "$1" | cleanxss >> "$CRAWLED"
awk -v IGNORECASE=1 -v RS='</p' 'RT{gsub(/.*<p[^>]*>/,"");print}' "$TEMPIDC" | cleanxss >> "$CRAWLED"
else
echo "link $1 is invalid"
fi
}
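# seed the URL list with the address given on the command line, if any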
if [ -n "$1" ]; then
SITE="$1"
visit "$SITE" urls.txt
fi
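# crawl one hop: visit every URL gathered so far, collecting newly found links into a fresh urls.txt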
URLLIST=$(cat urls.txt)
rm urls.txt
for fn in $URLLIST; do
echo "found URL $fn"
visit "$fn" urls.txt
done
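# deduplicate the gathered URLs; uniq -w 20 treats entries that share their first 20 characters as duplicates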
mv urls.txt urls.txt.bak
sort urls.txt.bak | uniq -w 20 > urls.txt
rm urls.txt.bak
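# prune pages that are nearly empty, oversized, or just HTTP error pages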
find ./content/ -type 'f' -size -100c -delete
find ./content/ -type 'f' -size +15k -delete
grep -lrIiZ "404 not found" ./content/ | xargs -0 rm --
grep -lrIiZ "403 forbidden" ./content/ | xargs -0 rm --