huntnw/crawly

#!/bin/bash
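# crawly: a tiny web crawler in plain bash.
# Usage: ./crawly [seed-url]   (with no argument it resumes from an existing urls.txt)
# Each page is fetched with curl, the links it contains are appended to urls.txt,
# and a cleaned text snapshot is stored under ./content/ (the directory must
# already exist). Relies on curl, shasum, and GNU tools (gawk, uniq -w).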
# cleanxss: HTML-escape &, <, >, " and ', flatten newlines to spaces, and end with a newline
cleanxss(){ sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g; s/"/\&quot;/g; s/'"'"'/\&#39;/g' | tr "\n" " " | sed -e '$a\'; }

# scratch file reused for each fetched page
TEMPIDC=$(mktemp)

# visit <url> <list-file>: fetch <url>, append the links found on the page to <list-file>,
# and write a cleaned text snapshot to ./content/
function visit(){
regex='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]'
string=$1
if [[ $string =~ $regex && $string != *"<"* && $string != *">"* ]]
then
echo "visiting $1"
sleep 1
# get URLs
CRAWLED="./content/"$(echo "$1" | shasum | head -c 5)
echo "$CRAWLED"
curl "$1" -m 1 -L -A "tser/1.0" > "$TEMPIDC"

# keep links that already look like pages (html/php/txt or a trailing slash); add a slash to the rest
grep 'href="' "$TEMPIDC" | grep -o 'https://[^"]*' | awk '/html$/ || /php$/ || /txt$/ || /\/$/ { print $0 }' >> "$2"
grep 'href="' "$TEMPIDC" | grep -o 'https://[^"]*' | awk '!/html$/ && !/php$/ && !/txt$/ && !/\/$/ { print $0"/" }' >> "$2"

# save the <title>, the URL itself, and every <p> block as the snapshot (RS/RT record splitting needs gawk)
awk -v IGNORECASE=1 -v RS='</title' 'RT{gsub(/.*<title[^>]*>/,"");print;exit}' "$TEMPIDC" | cleanxss > "$CRAWLED"
echo "$1" | cleanxss >> "$CRAWLED"
awk -v IGNORECASE=1 -v RS='</p' 'RT{gsub(/.*<p[^>]*>/,"");print}' "$TEMPIDC" | cleanxss >> "$CRAWLED"
else
echo "link $1 is invalid"
fi
}
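
# seed the crawl with the URL given on the command line, if any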
if [ -n "$1" ]; then
SITE=$1
visit "$SITE" urls.txt
fi
# gather every URL discovered so far, skipping malformed entries, then crawl each one
URLLIST=$(grep -v '<' urls.txt | grep -v ' ' | sort | uniq)
rm urls.txt
for fn in $URLLIST; do
echo "found URL $fn"
visit "$fn" urls.txt
done
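
# deduplicate the collected URLs, comparing only their first 20 characters (uniq -w is a GNU extension)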
mv urls.txt urls.txt.bak
sort urls.txt.bak | uniq -w 20 > urls.txt
rm urls.txt.bak
# prune snapshots that are too small (under 100 bytes) or too large (over 15 KiB)
find ./content/ -type 'f' -size -100c -delete
find ./content/ -type 'f' -size +15k -delete
# drop snapshots of error pages (-r keeps xargs from running rm when grep finds nothing)
grep -lrIiZ "404 not found" ./content/ | xargs -0 -r rm --
grep -lrIiZ "403 forbidden" ./content/ | xargs -0 -r rm --