open-journal-matcher/compare.py

""" run the comparisons using asyncio """
import asyncio
import asks
import concurrent.futures
import regex
import settings202201 as settings
import aiohttp
import langdetect
import os
import schedule
import spacy
from flask_bootstrap import Bootstrap
from collections import OrderedDict
from flask_wtf import FlaskForm
from wtforms import TextAreaField, SubmitField
from wtforms.validators import Length, ValidationError
from flask import Flask, render_template, request, url_for, Response, abort
from datetime import datetime
from redislite import StrictRedis
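
# Module-level setup: load the spaCy model once at startup, configure the Flask
# app, and keep a redislite-backed hash ("counter"/"increment") that serves as a
# simple hourly rate limit for the search endpoint in index().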
nlp = spacy.load("en_core_web_md", disable=["tagger", "parser", "ner", "lemmatizer"])
app = Flask(__name__, static_url_path="/static")
Bootstrap(app)
app.config["SECRET_KEY"] = settings.csrf
REDIS = os.path.join("/tmp/redis-dev.db")
r = StrictRedis(REDIS, charset="utf-8", decode_responses=True)
r.hset("counter", "increment", 0)


def reset_redis():
    r.hset("counter", "increment", 0)


schedule.every().hour.do(reset_redis)
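# NOTE: schedule only runs jobs when schedule.run_pending() is called; that call
# sits at the top of index(), so the hourly counter reset happens lazily on the
# next incoming request rather than on a background timer.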


class WebForm(FlaskForm):
    """ for validation """

    webabstract = TextAreaField(
        validators=[
            Length(
                min=150,
                max=10000,
                message="Your abstract must be between 150 and 10,000 characters.",
            )
        ]
    )

    def validate_webabstract(form, field):
        try:
            language = langdetect.detect(field.data)
        except langdetect.lang_detect_exception.LangDetectException:
            raise ValidationError(
                "Your abstract must be between 150 and 10,000 characters."
            )
        print(language)
        if language != "en":
            raise ValidationError(
                "The Open Journal Matcher only works with abstracts written in English."
            )

    submit = SubmitField("Search")
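

# Main page: validate the submitted abstract, enforce the per-hour search cap,
# then run the two async stages (similarity scoring via parent1, DOAJ title
# lookup via parent2) and render the sorted results.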
@app.route("/", methods=["GET", "POST"])
def index():
    """ display index page """
    form = WebForm()
    valid = form.validate_on_submit()
    schedule.run_pending()
    if request.method == "POST" and valid:
        # check to ensure not over rate limit
        counter = int(r.hget("counter", "increment"))
        counter += 1
        print("counter:", counter)
        if counter >= 10:
            rate_error = {
                "webabstract": [
                    "The application is experiencing peak load. Please try again later."
                ]
            }
            print("Turnaway due to load")
            return render_template(
                "index.html", form=form, errors=rate_error, output=""
            )
        r.hset("counter", "increment", counter)

        # lay the groundwork
        comp = {}
        unordered_scores = {}
        inp = form.webabstract.data
        t0 = datetime.now()

        # do the work
        asyncio.run(parent1(inp, comp))
        asyncio.run(parent2(comp, unordered_scores))

        # sort the results
        scores = OrderedDict(
            sorted(unordered_scores.items(), key=lambda t: t[0], reverse=True)
        )

        # calculate running time
        t1 = datetime.now()
        print(t1 - t0)
        return render_template("index.html", form=form, errors={}, output=scores)
    elif request.method == "POST" and not valid:
        return render_template("index.html", form=form, errors=form.errors, output="")
    else:
        return render_template("index.html", form=form, errors={}, output="")


@app.after_request
def add_security_headers(resp):
    resp.headers["X-Content-Type-Options"] = "nosniff"
    resp.headers["X-Frame-Options"] = "SAMEORIGIN"
    resp.headers["X-XSS-Protection"] = "1; mode=block"
    resp.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
    resp.headers[
        "Content-Security-Policy"
    ] = "script-src 'self'; style-src 'self'; default-src 'none'"
    return resp
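

# Stage 1: POST the abstract once per blob in settings.bucket_list; the cloud
# function replies with a similarity score, which cloud_work stores in comp
# keyed by blob name.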
async def parent1(inp, comp):
    """ manage the async calls to GCP """
    await asyncio.gather(
        *[cloud_work(blob, inp, comp, 0) for blob in settings.bucket_list]
    )
    return


async def cloud_work(blob, inp, comp, count):
    """ interact with google cloud function """
    max_out = 0
    try:
        async with aiohttp.ClientSession() as session:
            # retry transient upstream errors (429/500/503) up to five times
            while max_out < 6:
                async with session.post(
                    settings.cloud_function,
                    json={"inp": inp},
                    headers={"blob": blob},
                ) as resp:
                    if resp.status != 200:
                        print(resp.status)
                        if max_out >= 5:
                            raise Exception("Max out")
                    if resp.status == 200:
                        comp[blob] = await resp.text()
                        break
                    elif resp.status in {500, 503, 429}:
                        # yield to the event loop briefly before retrying
                        await asyncio.sleep(0.001)
                        max_out += 1
                    else:
                        raise Exception(str(resp.status))
    except (
        aiohttp.client_exceptions.ClientConnectorError,
        aiohttp.client_exceptions.ServerDisconnectedError,
        asyncio.TimeoutError,
        concurrent.futures.CancelledError,
    ) as e:
        # connection-level failures: retry the whole call up to three times
        print(type(e), e, str(count))
        if count < 3:
            await cloud_work(blob, inp, comp, count + 1)
    except Exception as e:
        print(type(e), e)
    return
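

# Stage 2: drop blobs whose response is not a usable score, keep the five
# highest-scoring journals, and resolve each to a title and DOAJ link for display.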
async def parent2(comp, unordered_scores):
    """ manage the async calls to the DOAJ api """
    # test for validity
    to_sort = [(k, v) for k, v in comp.items() if test_response(v)]
    print("Journals checked: " + str(len(to_sort)))
    # this numeric sort is needed to reduce API calls to doaj.org
    top = sorted(to_sort, key=lambda x: float(x[1]), reverse=True)[:5]
    # make calls to the doaj API asynchronously
    await asyncio.gather(
        *[titles(idx, item, unordered_scores) for idx, item in enumerate(top)]
    )
    return


def test_response(resp):
    """ some abstract collections raise ValueErrors. Ignore these """
    try:
        return float(resp)  # will evaluate as false if float == 0.0
    except ValueError:
        return False
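

# Blob names are expected to be ISSNs: titles() validates the format, queries the
# DOAJ journal API, and records (title, issn, url) keyed by the percentage score.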
async def titles(idx, item, unordered_scores):
    if regex.match(r"^[0-9]{4}-[0-9]{3}[0-9Xx]$", item[0]):
        issn = item[0]
    else:
        raise Exception("ISSN does not match regex")
    journal_data = await asks.get(
        "https://doaj.org/api/v2/search/journals/issn%3A" + issn
    )
    journal_json = journal_data.json()
    try:
        title = journal_json["results"][0]["bibjson"]["title"]
        if title[-1:] == " ":
            title = title[:-1]
        url = "https://doaj.org/toc/" + issn
    except (IndexError, KeyError, TypeError):
        title = "Title lookup failed. Try finding this item by ISSN instead."
        url = ""
    score = float(item[1]) * 100
    unordered_scores[score] = (title, issn, url)
    return


if __name__ == "__main__":
    app.run()