229 lines
6.7 KiB
Python
229 lines
6.7 KiB
Python
""" run the comparisons using asyncio """
|
|
|
|
import asyncio
|
|
import asks
|
|
import concurrent
|
|
import regex
|
|
import settings202201 as settings
|
|
import aiohttp
|
|
import langdetect
|
|
import os
|
|
import schedule
|
|
import spacy
|
|
from time import sleep
|
|
from flask_bootstrap import Bootstrap
|
|
from collections import OrderedDict
|
|
from flask_wtf import FlaskForm
|
|
from wtforms import TextAreaField, SubmitField
|
|
from wtforms.validators import Length, ValidationError
|
|
from flask import Flask, render_template, request, url_for, Response, abort
|
|
from datetime import datetime
|
|
from redislite import StrictRedis
|
|
|
|
# Load the spaCy English model once at import time, with the listed pipeline
# components disabled.
nlp = spacy.load("en_core_web_md", disable=["tagger", "parser", "ner", "lemmatizer"])

# Flask application with Bootstrap integration; the secret key comes from the
# settings module.
app = Flask(__name__, static_url_path="/static")
Bootstrap(app)
app.config["SECRET_KEY"] = settings.csrf

# Path of the redislite database file that stores the request counter.
# (os.path.join() with a single argument returned the string unchanged.)
REDIS = "/tmp/redis-dev.db"

# `encoding` is the supported spelling of the deprecated `charset` keyword.
r = StrictRedis(REDIS, encoding="utf-8", decode_responses=True)

# Start with a zeroed counter every time this module is imported.
r.hset("counter", "increment", 0)
|
|
|
|
|
|
def reset_redis():
    """Set the `increment` field of the `counter` hash in redis back to zero."""
    hash_name, field_name = "counter", "increment"
    r.hset(hash_name, field_name, 0)
|
|
|
|
|
|
# Register the counter reset as an hourly job; it only fires when
# schedule.run_pending() is called.
schedule.every(1).hours.do(reset_redis)
|
|
|
|
|
|
class WebForm(FlaskForm):
    """Abstract submission form: checks length and detected language."""

    # Abstract text; the Length validator enforces 150..10000 characters.
    webabstract = TextAreaField(
        validators=[
            Length(
                message="Your abstract must be between 150 and 10,000 characters.",
                min=150,
                max=10000,
            )
        ]
    )

    submit = SubmitField("Search")

    def validate_webabstract(form, field):
        """Inline validator for `webabstract`.

        Raises ValidationError when langdetect cannot detect a language or
        when the detected language is anything other than English.
        """
        try:
            detected = langdetect.detect(field.data)
        except langdetect.lang_detect_exception.LangDetectException:
            raise ValidationError(
                "Your abstract must be between 150 and 10,000 characters."
            )

        print(detected)
        if detected == "en":
            return

        raise ValidationError(
            "The Open Journal Matcher only works with abstracts written in English."
        )
|
|
|
|
|
|
@app.route("/", methods=["GET", "POST"])
def index():
    """Render the index page; for a valid POST, run the comparison first.

    Non-POST requests get the empty form and invalid POSTs get the form with
    its validation errors. A valid POST is counted against the redis-backed
    counter and turned away once the incremented value reaches 10; otherwise
    the two async stages run and their scores are rendered in descending
    order.
    """
    form = WebForm()
    valid = form.validate_on_submit()
    # Scheduled jobs (the counter reset) only run when a request comes in.
    schedule.run_pending()

    if request.method != "POST":
        return render_template("index.html", form=form, errors={}, output="")

    if not valid:
        return render_template("index.html", form=form, errors=form.errors, output="")

    # check to ensure not over rate limit
    counter = int(r.hget("counter", "increment")) + 1
    print("counter:", counter)
    if counter >= 10:
        rate_error = {
            "webabstract": [
                "The application is experiencing peak load. Please try again later."
            ]
        }
        print("Turnaway due to load")
        return render_template("index.html", form=form, errors=rate_error, output="")
    r.hset("counter", "increment", counter)

    # containers filled in place by the two async stages
    comp = {}
    unordered_scores = {}
    inp = form.webabstract.data
    started = datetime.now()

    # stage 1 fills `comp`, stage 2 fills `unordered_scores`
    asyncio.run(parent1(inp, comp))
    asyncio.run(parent2(comp, unordered_scores))

    # highest score first
    ranked = sorted(unordered_scores.items(), key=lambda pair: pair[0], reverse=True)
    scores = OrderedDict(ranked)

    # report running time
    finished = datetime.now()
    print(finished - started)

    return render_template("index.html", form=form, errors={}, output=scores)
|
|
|
|
|
|
@app.after_request
def add_security_headers(resp):
    """Set a fixed group of security headers on the response and return it."""
    security_headers = (
        ("X-Content-Type-Options", "nosniff"),
        ("X-Frame-Options", "SAMEORIGIN"),
        ("X-XSS-Protection", "1; mode=block"),
        ("Strict-Transport-Security", "max-age=31536000; includeSubDomains"),
        (
            "Content-Security-Policy",
            "script-src 'self'; style-src 'self'; default-src 'none'",
        ),
    )
    for header_name, header_value in security_headers:
        resp.headers[header_name] = header_value
    return resp
|
|
|
|
|
|
async def parent1(inp, comp):
    """Run cloud_work concurrently for every entry in settings.bucket_list.

    Each call starts with a retry count of 0 and receives the shared `comp`
    dict, which cloud_work fills in place. Nothing is returned.
    """
    jobs = (cloud_work(blob, inp, comp, 0) for blob in settings.bucket_list)
    await asyncio.gather(*jobs)
|
|
|
|
|
|
async def cloud_work(blob, inp, comp, count):
    """POST the abstract to the cloud function for one blob and store the reply.

    Args:
        blob: value sent in the `blob` request header; also the key written
            into `comp`.
        inp: abstract text, sent as the JSON body `{"inp": inp}`.
        comp: dict updated in place with `comp[blob] = <response text>` when
            the call returns HTTP 200.
        count: number of connection-level retries already made for this blob.

    A 500, 503 or 429 response is retried after a short pause; the sixth
    non-200 response raises "Max out". Any other non-200 status raises
    immediately. Those exceptions are caught here and printed, so the
    coroutine never raises to its caller and `comp` simply has no entry for
    the blob. Connection errors, timeouts and cancellations are printed and
    the whole call is retried while `count < 3`.
    """
    max_out = 0
    try:
        async with aiohttp.ClientSession() as session:
            while max_out < 6:
                async with session.post(
                    settings.cloud_function,
                    json={"inp": inp},
                    headers={"blob": blob},
                ) as resp:
                    if resp.status == 200:
                        comp[blob] = await resp.text()
                        break

                    print(resp.status)
                    if max_out >= 5:
                        raise Exception("Max out")
                    if resp.status not in {500, 503, 429}:
                        raise Exception(str(resp.status))

                    # Yield to the event loop while waiting. A blocking
                    # time.sleep() here would stall every other request
                    # gathered alongside this one.
                    await asyncio.sleep(0.001)
                    max_out += 1
    except (
        aiohttp.client_exceptions.ClientConnectorError,
        aiohttp.client_exceptions.ServerDisconnectedError,
        asyncio.TimeoutError,
        concurrent.futures.CancelledError,
    ) as e:
        print(type(e), e, str(count))
        if count < 3:
            await cloud_work(blob, inp, comp, count + 1)
    except Exception as e:
        # Best effort: report the failure and leave this blob out of `comp`.
        print(type(e), e)
|
|
|
|
|
|
async def parent2(comp, unordered_scores):
    """Pick the five best-scoring journals and look up their titles.

    Args:
        comp: dict mapping an identifier to the response text collected by
            the first stage.
        unordered_scores: dict filled in place by `titles`.

    Entries whose response fails `test_response` are dropped. The rest are
    ranked by numeric score and only the top five are passed to `titles`,
    which run concurrently.
    """
    # test for validity
    to_sort = [(k, v) for k, v in comp.items() if test_response(v)]
    print("Journals checked:" + str(len(to_sort)))

    # This sort is needed to reduce API calls to doaj.org. Compare the scores
    # as floats: the raw values are strings, and a string sort would rank
    # e.g. "1e-05" above "0.9".
    top = sorted(to_sort, key=lambda x: float(x[1]), reverse=True)[:5]

    # make calls to the doaj API asynchronously
    await asyncio.gather(
        *[titles(idx, item, unordered_scores) for idx, item in enumerate(top)]
    )
|
|
|
|
|
|
def test_response(resp):
    """Return the response as a float, or False when it is not a usable score.

    Some abstract collections return text that is not a number; those are
    ignored. A missing response (None) and non-finite values ("nan", "inf")
    are rejected as well, because they cannot be ranked or displayed.

    Args:
        resp: response text expected to contain a number.

    Returns:
        The parsed float (0.0 is falsy, so a zero score is also treated as
        invalid by callers that test truthiness), or False.
    """
    import math

    try:
        value = float(resp)
    except (TypeError, ValueError):
        return False
    if not math.isfinite(value):
        return False
    return value  # will evaluate as false if float == 0.0


# This is a helper, not a test case: stop pytest from collecting it because
# of the `test_` prefix when the module is imported into a test file.
test_response.__test__ = False
|
|
|
|
|
|
async def titles(idx, item, unordered_scores):
    """Look up a journal title on doaj.org and record it under its score.

    Args:
        idx: position of the item in the ranked list (not used here).
        item: `(issn, score_text)` pair.
        unordered_scores: dict updated in place with
            `score * 100 -> (title, issn, url)`.

    Raises:
        Exception: if `item[0]` does not look like an ISSN.

    NOTE(review): entries are keyed by score, so two journals with exactly
    the same score overwrite each other.
    """
    if not regex.match(r"^[0-9]{4}-[0-9]{3}[0-9Xx]$", item[0]):
        raise Exception("ISSN does not match regex")
    issn = item[0]

    journal_data = await asks.get(
        "https://doaj.org/api/v2/search/journals/issn%3A" + issn
    )

    try:
        # Decoding sits inside the best-effort block so that a non-JSON reply
        # falls back to the placeholder title instead of failing the request.
        journal_json = journal_data.json()
        title = journal_json["results"][0]["bibjson"]["title"]
        if title[-1:] == " ":
            title = title[:-1]
        url = "https://doaj.org/toc/" + issn
    except (KeyError, IndexError, TypeError, ValueError):
        # Only lookup and parse failures are handled; a bare `except:` would
        # also swallow cancellation and KeyboardInterrupt.
        title = "Title lookup failed. Try finding this item by ISSN instead.."
        url = ""

    score = float(item[1]) * 100
    unordered_scores[score] = (title, issn, url)
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: start the Flask server with its default settings.
    app.run()
|