This commit is contained in:
lickthecheese 2020-03-19 10:35:34 -04:00
parent 1f83896371
commit d94901f6f2
1 changed file with 2 additions and 3 deletions

5
crawly
View File

@@ -16,12 +16,12 @@ if [[ $string =~ $regex && $string != *"<"* && $string != *">"* ]]
then
echo visiting $1
sleep 1
# get URLs
CRAWLED="./content/"`echo $1 | shasum | head -c 5`
echo $CRAWLED
curl $1 -m 5 -L > $TEMPIDC
curl $1 -m 1 -L > $TEMPIDC
cat $TEMPIDC | grep href=\" | grep "https://" | grep -o "https:\/\/[^\"]*" | awk '/html$/ || /php$/ || /txt$/ || /\/$/ { print $0 }' >> $2
cat $TEMPIDC | grep href=\" | grep "https://" | grep -o "https:\/\/[^\"]*" | awk '!/html$/ && !/php$/ && !/txt$/ && !/\/$/ { print $0"/" }' >> $2
@@ -45,7 +45,6 @@ rm urls.txt
# Walk every entry of $URLLIST and hand it to visit together with urls.txt.
# $URLLIST is expanded unquoted, so the shell word-splits (and glob-expands)
# it; each resulting word becomes one value of $fn.
# NOTE(review): this loop is shown through a diff hunk whose +/- markers were
# lost, and the hunk header records one line removed in this region — confirm
# against the real script which of the statements below is still present.
for fn in $URLLIST; do
# Log the entry about to be processed.
echo "found URL $fn"
# Pause one second between entries.
sleep 1
# visit is defined outside this view; $fn is passed unquoted as its first
# argument and the literal file name urls.txt as its second.
visit $fn urls.txt
done