open-journal-matcher/fetch_bytes.py

89 lines
2.6 KiB
Python

""" loop through the issns, gather abstracts and write to abstracts-<MONTH>/ """
import json
import os
import requests
import spacy
from time import sleep
from bs4 import BeautifulSoup
# Date tag (looks like YYYY-MM) embedded in the names of the ISSN list that
# is read and of the files/directories this script writes.
MONTH = "2021-12"
# Shared spaCy pipeline, loaded once at import time with the listed
# components disabled (presumably only tokenization and word vectors are
# needed for the stored docs -- TODO confirm).
nlp = spacy.load("en_core_web_md", disable=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"])
def fetch(issn):
    """Fetch article records for one journal from the DOAJ search API.

    Requests ``pageSize=100`` records sorted by year (descending) for
    ``issn`` and prints a progress line for every attempt.  An HTTP 429
    response triggers a 10 second pause followed by another attempt; this
    repeats until a different status is returned.

    Args:
        issn: ISSN string appended to the search URL.

    Returns:
        The ``results`` value of the JSON response, or an empty list when
        the body is not JSON, is not a JSON object, or has no ``results``.

    Raises:
        requests.exceptions.RequestException: if the connection fails or
            the server does not answer within the timeout.
    """
    url = (
        "https://doaj.org/api/v1/search/articles/issn%3A"
        + issn
        + "?pageSize=100&sort=year%3Adesc"
    )

    # The progress counters live in the ``__main__`` loop's module globals.
    # Fall back to "?" so the function also works when it is imported and
    # called without that loop running.
    position = globals().get("idx")
    all_issns = globals().get("issns")
    current = "?" if position is None else str(position + 1)
    total = "?" if all_issns is None else str(len(all_issns))

    # Retry in a loop rather than by recursion so that a long period of
    # rate limiting cannot exhaust the call stack.
    while True:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=60)
        print(
            f"fetching data for {issn}. {current}/{total}. "
            f"status: {response.status_code}"
        )
        if response.status_code != 429:
            break
        sleep(10)
        # Message text kept as-is; 429 actually means "too many requests".
        print("forbidden")

    try:
        articles = response.json().get("results")
    except (ValueError, AttributeError):
        # ValueError: body is not valid JSON.
        # AttributeError: valid JSON, but the top level is not an object.
        return []
    # A response without "results" yields None; normalise to an empty list
    # so callers can always call len() on / iterate over the result.
    return articles or []
def parse(articles, min_abstracts=10):
    """Combine the abstracts of ``articles`` into one spaCy ``Doc``.

    Each article's ``bibjson.abstract`` is stripped of markup with
    BeautifulSoup; the resulting texts are concatenated (each one preceded
    by a single space) and run through the module-level ``nlp`` pipeline.
    Articles with a missing or non-string abstract are skipped.

    Args:
        articles: iterable of article records (dicts) as returned by
            ``fetch``; an empty string or list is accepted.
        min_abstracts: minimum number of usable abstracts required to
            build a ``Doc``.  Defaults to 10.

    Returns:
        The ``Doc`` for the combined text, or ``None`` when fewer than
        ``min_abstracts`` usable abstracts were found.
    """
    print("Number of articles: " + str(len(articles)))

    texts = []
    for article in articles:
        try:
            raw = article["bibjson"]["abstract"]
        except (KeyError, TypeError):
            # KeyError: field absent.  TypeError: record or "bibjson" is
            # null / not a mapping.
            continue
        if not isinstance(raw, str):
            # e.g. a JSON null abstract, which BeautifulSoup cannot parse.
            continue
        texts.append(BeautifulSoup(raw, "lxml").text)

    if not texts or len(texts) < min_abstracts:
        print("fail! " + str(len(texts)))
        return None

    # Every abstract is prefixed with a space (including the first one) so
    # the text handed to spaCy is identical to what earlier runs produced.
    combined = "".join(" " + text for text in texts)
    return nlp(combined)
if __name__ == "__main__":
    # Script entry point: for each ISSN in the monthly list, fetch its
    # abstracts, store the serialized spaCy Doc and its vocab under
    # abstracts-<MONTH>/, and finally record which ISSNs have data.
    #
    # NOTE: ``idx`` and ``issns`` are deliberately module-level names here;
    # fetch() reads them for its progress output.
    out_dir = "abstracts-" + MONTH
    # Make sure the output directory exists so the first write on a fresh
    # checkout does not fail with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)

    with open("issnlist-" + MONTH + ".txt") as issnfile:
        issns = json.load(issnfile)

    issns_output = []
    # Only the first 200 ISSNs of the list are processed.
    for idx, issn in enumerate(issns[:200]):
        doc_path = out_dir + "/" + issn

        if os.path.exists(doc_path):
            # Already stored by an earlier run: keep it, skip the fetch.
            issns_output.append(issn)
            continue

        doc = parse(fetch(issn))
        if not doc:
            # No file yet and not enough data to build one.
            continue

        # Write the vocab before the Doc file: the Doc file is what marks
        # an ISSN as done on later runs, so it must be written last.
        # exist_ok tolerates a vocab directory left by an interrupted run.
        vocab_path = doc_path + "-vocab"
        os.makedirs(vocab_path, exist_ok=True)
        doc.vocab.to_disk(vocab_path)
        with open(doc_path, "wb") as abstractfile:
            abstractfile.write(doc.to_bytes())
        issns_output.append(issn)

    nlp.config.to_disk(out_dir + "/config.cfg")
    with open("issns-" + MONTH + ".txt", "w") as issnfile:
        json.dump(issns_output, issnfile)