Initial commit

This commit is contained in:
Ricardo Mazeto 2020-01-15 14:28:43 -07:00
commit 2df16b4e52
2 changed files with 101 additions and 0 deletions

101
fiis-scrapper.sh Executable file
View File

@ -0,0 +1,101 @@
#!/usr/bin/env bash
# fiis-scrapper: scrape fund pages from fiis.com.br.
#
# bash is requested through env because /usr/bin/sh is not present on every
# system and the script was written against bash's builtins.

# The download of the listing page is disabled; anual.html is expected to
# already be in the current directory.
#echo "Downloading fiis.com.br/anual.html"
#wget -q fiis.com.br/anual -O anual.html

# extract_tickers FILE
# Print, one per line, every ticker found in FILE. A ticker is the run of
# upper-case letters and digits that directly follows a
# <span class="ticker"> opening tag.
extract_tickers() {
  grep -Eo '<span class="ticker">[A-Z0-9]+' -- "$1" \
    | sed 's/<span class="ticker">//g'
}

echo "Extracting tickers"
extract_tickers anual.html > fiis.txt
# download_tickers LIST
# Fetch the fiis.com.br page of every ticker named in LIST (one ticker per
# line) into "<ticker>.fii.html" in the current directory, showing an
# in-place "current/total" progress counter on stdout.
# Returns 1 when LIST cannot be read; a failed download only prints a
# warning on stderr and the loop carries on with the next ticker.
download_tickers() {
  dl_list=$1
  if [ ! -r "$dl_list" ]; then
    printf 'cannot read ticker list: %s\n' "$dl_list" >&2
    return 1
  fi
  # Count the non-empty lines with grep instead of `wc -l FILE`: wc prints
  # the file name after the number, which ended up inside the progress text.
  dl_total=$(grep -c . -- "$dl_list")
  dl_index=1
  while IFS= read -r dl_ticker || [ -n "$dl_ticker" ]; do
    [ -n "$dl_ticker" ] || continue
    # printf rather than `echo -en`, whose flags are not portable.
    printf '\r%s/%s downloading %s' "$dl_index" "$dl_total" "$dl_ticker"
    # stdin is detached so the download can never consume the ticker list.
    wget -q -O "$dl_ticker.fii.html" "fiis.com.br/$dl_ticker" < /dev/null \
      || printf '\nwarning: download failed for %s\n' "$dl_ticker" >&2
    dl_index=$((dl_index + 1))
  done < "$dl_list"
  # Terminate the progress line.
  echo
}

echo "Downloading tickers"
# For a quick trial run, pass a shortened list instead, e.g. one made with
# `head -n 4 fiis.txt`.
download_tickers fiis.txt
# extract_tables
# For every downloaded "<ticker>.fii.html" in the current directory, pull
# out the <tbody>...</tbody> markup, strip the table tags and write what is
# left to "<ticker>.fii.txt", framed by one empty line before and a newline
# after.
extract_tables() {
  # Only the downloaded pages are matched. The former `*.fii.*` glob also
  # picked up the .fii.txt files of an earlier run and produced stray
  # "<ticker>.fii.txt.txt" copies of them.
  for page in *.fii.html; do
    # An unmatched glob stays a literal pattern; skip it instead of creating
    # a junk output file.
    [ -e "$page" ] || continue
    {
      echo
      # -z makes grep read the page as NUL-separated records, so the match
      # may span several lines; each match is written with a trailing NUL.
      # The sed expressions, in order: </td> becomes a space, <td> is
      # dropped, <tr> becomes a newline, "R$ " becomes a space, and the
      # <tbody>, </tr>, </tbody> tags are dropped.
      # NOTE(review): the last expression replaces a space with a space as
      # written; the first space may originally have been a different
      # whitespace character - confirm against the upstream file.
      grep --color=never -Eoz '<tbody>.*</tbody>' -- "$page" \
        | tr '\n' ' ' \
        | sed \
            -e 's/<\/td>/ /g' \
            -e 's/<td>//g' \
            -e 's/<tr>/\n/g' \
            -e 's/R$ / /g' \
            -e 's/<tbody>//g' \
            -e 's/<\/tr>//g' \
            -e 's/<\/tbody>//g' \
            -e 's/ / /g'
      echo
    } > "${page%.html}.txt"
    # Removal of the downloaded page stays disabled; the pages are read
    # again further down in this script.
    #rm -- "$page"
  done
}

echo "Extracting data"
extract_tables
# fix_parens
# Rewrite every "*fii.txt" file in the current directory in place so that
# each line is closed with ")" and each line that began with a space is
# opened with "( ".
#
# sed runs with -z, so the pattern space holds text up to the next NUL (or
# end of file) and "\n" can be matched directly. The program, in order:
#   s/\n \n / /g    join "newline space newline space" into one space
#   s/\n/)\n(/g     close the text before each newline, open the text after
#   s/( /(-/g       mark an opening paren that is followed by a space ...
#   s/(-/[/g        ... and park it as "[" (any existing "(-" is parked too)
#   s/(//g          drop every remaining "("
#   s/\[/( /g       turn each parked "[" back into "( "
# Literal "(" and "[" that were already in the data go through the same
# steps.
fix_parens() {
  for rows in *fii.txt; do
    # An unmatched glob stays a literal pattern; there is nothing to edit.
    [ -e "$rows" ] || continue
    sed -zi 's/\n \n / /g;s/\n/)\n(/g;s/( /(-/g;s/(-/[/g;s/(//g;s/\[/( /g' -- "$rows"
  done
}

# Fix the parens.
fix_parens
# span_text MARKER FILE
# For each line of FILE containing `MARKER">...</span>`, print what is left
# of the match once `MARKER">` and every `</span>` have been removed.
span_text() {
  grep -Eo "$1\">.*</span>" -- "$2" \
    | sed "s|$1\">||g;s|</span>||g"
}

# labeled_value LABEL VALUE_REGEX FILE
# Flatten FILE onto a single line, find
#   LABEL</span> <span class="value">VALUE</span>
# where VALUE matches the extended regex VALUE_REGEX, and print VALUE.
# grep -z terminates each match with a NUL byte; those are removed here so
# that nothing depends on how the shell treats NULs in a command
# substitution.
labeled_value() {
  tr '\n' ' ' < "$3" \
    | grep -Eoz "$1</span> <span class=\"value\">$2</span>" \
    | sed "s|$1</span> <span class=\"value\">||g;s|</span>||g" \
    | tr -d '\000'
}

# build_fiis
# Print the complete `(define fiis (quote (...)))` form on stdout: one entry
# per "*fii.txt" file in the current directory, made of the fields scraped
# from the matching "<ticker>.fii.html" page followed by the content of the
# .txt file itself.
build_fiis() {
  echo "(define fiis (quote ("
  for rows in *fii.txt; do
    # An unmatched glob stays a literal pattern; emit no entry for it.
    [ -e "$rows" ] || continue
    ticker=${rows%.fii.txt}
    page=$ticker.fii.html

    fundname=$(span_text 'fund-name' "$page")
    admname=$(span_text 'administrator-name' "$page")
    cnpj=$(span_text 'administrator-doc' "$page")
    tel=$(labeled_value 'Telefone' '[0-9)( -]*' "$page")
    # The two counts are printed unquoted, with the dots removed.
    numdecotas=$(labeled_value 'Número de Cotas' '[0-9.]+' "$page" | tr -d '.')
    numdecotistas=$(labeled_value 'Número de Cotistas' '[0-9.]+' "$page" | tr -d '.')
    # NOTE(review): the [A-z] range also covers the ASCII punctuation that
    # sits between "Z" and "a", and it admits neither spaces nor digits -
    # confirm that this is what the pages need.
    nomenopregao=$(labeled_value 'Nome no Pregão' '[A-z]+' "$page")
    tipodofii=$(labeled_value 'Tipo do FII' '[A-z:]+' "$page")

    # printf keeps backslashes in the scraped text literal.
    printf '((\n%s\n' "$ticker"
    printf '"%s"\n' "$fundname" "$admname" "$cnpj" "$tel"
    printf '%s\n' "$numdecotas" "$numdecotistas"
    printf '"%s"\n' "$nomenopregao" "$tipodofii"
    echo ")(("
    cat -- "$rows"
    echo "))"
  done
  echo ")))"
}

# generate_fiis_ss
# Write the output of build_fiis to fiis.ss in the current directory.
generate_fiis_ss() {
  # The file is replaced, not appended to: with `>>` every rerun stacked one
  # more complete (define fiis ...) form onto the previous output.
  # Get rid of those pesky \x00 characters on the way out.
  build_fiis | tr -d '\000' > fiis.ss
}

generate_fiis_ss

0
readme.md Normal file
View File