#!/bin/bash
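# huntnw - a small shell web crawler.
# Fetches each URL it is given, stores the page title, URL and paragraph text
# under ./content/, and appends newly discovered https:// links to urls.txt.
# Optionally takes a seed URL as its first argument.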
# HTML-escape &, <, >, " and ', flatten newlines to spaces, and end the output with a newline
cleanxss(){ sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g; s/"/\&quot;/g; s/'"'"'/\&#39;/g' | tr "\n" " " | sed -e '$a\'; }

# scratch file for the page currently being fetched
TEMPIDC=/tmp/$RANDOM

function visit(){
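    # visit <url> <linkfile>: fetch <url>, save its title/URL/text under ./content/,
    # and append any https:// links found on the page to <linkfile>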
    # only crawl arguments that look like a bare URL with no embedded markup
    regex='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]'
    string=$1

    if [[ $string =~ $regex && $string != *"<"* && $string != *">"* ]]
    then
        echo "visiting $1"
        sleep 1

        # each fetched page is stored under ./content/, keyed by a short hash of its URL
        CRAWLED="./content/$(echo "$1" | shasum | head -c 5)"
        echo "$CRAWLED"

        curl "$1" -m 1 -L -A "huntnw/1.0" > "$TEMPIDC"
        # get URLs: append absolute https:// links to $2, document-like URLs
        # (ending in html/php/txt or a slash) as-is, everything else with a trailing slash
        grep 'href="' "$TEMPIDC" | grep "https://" | grep -o 'https://[^"]*' | awk '/html$/ || /php$/ || /txt$/ || /\/$/ { print $0 }' >> "$2"
        grep 'href="' "$TEMPIDC" | grep "https://" | grep -o 'https://[^"]*' | awk '!/html$/ && !/php$/ && !/txt$/ && !/\/$/ { print $0"/" }' >> "$2"

        # save the page title, the URL and all paragraph text (one line each),
        # HTML-escaped by cleanxss; the multi-character RS and RT depend on gawk
        awk -v IGNORECASE=1 -v RS='</title' 'RT{gsub(/.*<title[^>]*>/,"");print;exit}' "$TEMPIDC" | cleanxss > "$CRAWLED"
        echo "$1" | cleanxss >> "$CRAWLED"
        awk -v IGNORECASE=1 -v RS='</p' 'RT{gsub(/.*<p[^>]*>/,"");print}' "$TEMPIDC" | cleanxss >> "$CRAWLED"
    else
        echo "link $1 is invalid"
    fi
}
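
# seed the crawl with a URL passed on the command line, if any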
if [ ! -z "$1" ]; then
    SITE=$1
    visit "$SITE" urls.txt
fi
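
# crawl every URL collected so far, skipping entries that contain markup or spaces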
URLLIST=$(grep -v '<' urls.txt | grep -v ' ' | sort | uniq)

rm urls.txt

for fn in $URLLIST; do
    echo "found URL $fn"
    visit "$fn" urls.txt
done
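
# deduplicate the collected URLs, keeping one entry per distinct 20-character prefix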
mv urls.txt urls.txt.bak
sort urls.txt.bak | uniq -w 20 > urls.txt
rm urls.txt.bak
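
# discard saved pages that are under 100 bytes or over 15 KiB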
find ./content/ -type 'f' -size -100c -delete
find ./content/ -type 'f' -size +15k -delete
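
# drop saved pages that are just HTTP error bodies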
grep -lrIiZ "404 not found" ./content/ | xargs -0 rm --
|
|
grep -lrIiZ "403 forbidden" ./content/ | xargs -0 rm --
|
|
|
|
|