#!/bin/bash

# Escape HTML-special characters and flatten the input to a single line.
cleanxss(){ sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g; s/"/\&quot;/g; s/'"'"'/\&#39;/g' | tr "\n" " " | sed -e '$a\'; }

TEMPIDC=/tmp/$RANDOM

function visit(){
    regex='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]'
    string=$1
    if [[ $string =~ $regex && $string != *"<"* && $string != *">"* ]]
    then
        echo "visiting $1"
        sleep 1
        # get URLs
        CRAWLED="./content/"`echo "$1" | shasum | head -c 5`
        echo "$CRAWLED"
        curl "$1" -m 1 -L -A "tser/1.0" > $TEMPIDC
        # Links that already look like pages (html/php/txt or trailing slash) are queued as-is;
        # all other links get a trailing slash appended before queueing.
        cat $TEMPIDC | grep href=\" | grep "https://" | grep -o "https:\/\/[^\"]*" | awk '/html$/ || /php$/ || /txt$/ || /\/$/ { print $0 }' >> $2
        cat $TEMPIDC | grep href=\" | grep "https://" | grep -o "https:\/\/[^\"]*" | awk '!/html$/ && !/php$/ && !/txt$/ && !/\/$/ { print $0"/" }' >> $2
        # Store the page title, the URL, and the paragraph text (HTML-escaped) in ./content/.
        cat $TEMPIDC | awk -v IGNORECASE=1 -v RS='</title' 'RT{gsub(/.*<title[^>]*>/,"");print;exit}' | cleanxss > "$CRAWLED"
        echo "$1" | cleanxss >> "$CRAWLED"
        cat $TEMPIDC | awk -v IGNORECASE=1 -v RS='</p' 'RT{gsub(/.*<p[^>]*>/,"");print}' | cleanxss >> "$CRAWLED"
    else
        echo "link $1 is invalid"
    fi
}

# Seed the queue with the URL given on the command line, if any.
if [ ! -z "$1" ]; then
    SITE=$1
    visit "$SITE" urls.txt
fi

# Visit every URL collected so far; newly discovered links are appended to urls.txt.
URLLIST=`cat urls.txt | sort | uniq`
rm urls.txt
for fn in $URLLIST; do
    echo "found URL $fn"
    visit "$fn" urls.txt
done

# Deduplicate the URL list, comparing only the first 20 characters.
mv urls.txt urls.txt.bak
cat urls.txt.bak | sort | uniq -w 20 > urls.txt
rm urls.txt.bak

# Drop crawled pages that are too small, too large, or error pages.
find ./content/ -type 'f' -size -100c -delete
find ./content/ -type 'f' -size +15k -delete
grep -lrIiZ "404 not found" ./content/ | xargs -0 rm --
grep -lrIiZ "403 forbidden" ./content/ | xargs -0 rm --
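
# Example run (a sketch; "crawl.sh" and the seed URL are placeholder names, not
# part of the script itself). The ./content/ directory must exist before the
# first visit; urls.txt is created by the first successful crawl:
#
#   mkdir -p ./content
#   ./crawl.sh "https://example.com/"
#
# Running the script again without an argument continues crawling the URLs
# already collected in urls.txt.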