cleanup; add draft flask implementation

This commit is contained in:
Mark Eaton 2020-01-31 21:06:41 -05:00
parent 068c5f2966
commit 31e869751f
7 changed files with 90 additions and 134 deletions

View File

@ -1,6 +1,6 @@
MIT License
Copyright (c) 2019
Copyright (c) 2019 Mark Eaton
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@ -1,10 +1,6 @@
# A journal recommender tool built on the Directory of Open Access Journals
This is a work in progress. The goal is to create a journal recommender to give journal suggestions based on a draft abstract. It still needs work.
The main problem at this point is that it is very slow. The `compare-*.py` files are iterative attempts at addressing the slowness. They still have bugs.
Some systematic profiling of the various solutions to speed up the code needs to be done. Hopefully this can be added soon.
This is a work in progress. The goal is to create a journal recommender to give journal suggestions based on a draft abstract. It still needs quite a bit of work. Ultimately, the aim is to have a Flask application combined with "serverless" infrastructure for data analysis.
Presented at the 18th Annual CUNY IT Conference. New York, NY. December 5, 2019.

View File

@ -1,53 +0,0 @@
""" run the comparisons using the most basic approach """
import spacy
import glob
import requests
from spacy.tokens import Doc
from datetime import datetime
# Load the medium English spaCy model once; the rest of the script uses it
# both to parse the abstract and to supply the vocab for stored Docs.
nlp = spacy.load("en_core_web_md")
# NOTE(review): `comp` is not referenced again in the visible part of this
# script.
comp = {}
# Prompt for the draft abstract on stdin and parse it with the model.
inp = input("Abstract: ")
abs_data = nlp(inp)
# NOTE(review): `counter` is not referenced again in the visible part of this
# script.
counter = 0
# Collects the (similarity, file-name tail) tuples returned by fio().
result = []
# Start of the timed section; the elapsed time is printed at the end of the
# script. The timer starts after the model load and the prompt.
t0 = datetime.now()
def fio(item):
    """Score one stored journal document against the input abstract.

    Args:
        item: Path of a file containing a spaCy ``Doc`` serialized to bytes.

    Returns:
        A ``(similarity, tail)`` tuple. ``tail`` is the last nine characters
        of ``item``; the script later passes it to the DOAJ API as an ISSN.
    """
    with open(item, "rb") as item_data:
        data = Doc(nlp.vocab).from_bytes(item_data.read())
    # Compute the similarity once and reuse it for both the progress output
    # and the return value (it used to be computed twice per document).
    score = abs_data.similarity(data)
    print(score)
    return (score, item[-9:])
# glob.glob() already returns a list, so no extra list() copy is needed.
gl = glob.glob("docs-md/*")
for item in gl:
    result.append(fio(item))

print("sorting")
# Keep the five highest-scoring (similarity, issn) tuples.
top = sorted(result, key=lambda x: x[0], reverse=True)[:5]

print("get journal info from API")
for item in top:
    score = item[0]
    issn = item[1]
    journal_data = requests.get(
        "https://doaj.org/api/v1/search/journals/issn%3A" + issn
    )
    if journal_data.status_code == 200:
        journal_json = journal_data.json()
        try:
            title = journal_json["results"][0]["bibjson"]["title"]
        except (KeyError, IndexError, TypeError):
            # Best effort: the API answered but the payload has no usable
            # title (no results, or an unexpected shape). Only lookup
            # failures are caught so that Ctrl-C and real bugs still surface.
            title = " "
        print(issn, title, score)
    else:
        print(issn, score)

# Report how long scoring plus the API lookups took.
t1 = datetime.now()
print(t1 - t0)

70
compare-flask.py Normal file
View File

@ -0,0 +1,70 @@
""" run the comparisons in a flask app """
import spacy
import glob
import collections
import requests
from spacy.tokens import Doc
from flask import Flask, render_template, request
from wtforms import Form, StringField, validators
# Load the medium English spaCy model once at import time so that request
# handlers can reuse it.
nlp = spacy.load("en_core_web_md")
# Module-level dict: anything stored here persists across requests for the
# lifetime of the process.
comp = {}
# The Flask application object that the route decorators below attach to.
app = Flask(__name__)
class RegistrationForm(Form):
    """Form with a single ``abstract`` text field.

    The field is accepted only when its length is between 25 and 10000
    characters; otherwise the validator reports the message below.
    """

    abstract = StringField(
        label="abstract",
        validators=[
            validators.length(
                max=10000,
                min=25,
                message="Your abstract must be between 25 and 10000 characters.",
            ),
        ],
    )
def _rank_journals(abs_data, limit=5):
    """Return the ``limit`` best ``(name, score)`` pairs for ``abs_data``.

    Every file matched by ``docs/*`` is read as a serialized spaCy ``Doc`` and
    compared with ``abs_data``. Scores are kept in a dict local to this call
    so that requests never share or overwrite each other's results.
    """
    scores = {}
    for counter, item in enumerate(glob.glob("docs/*"), start=1):
        print(item, counter)
        with open(item, "rb") as item_data:
            data = Doc(nlp.vocab).from_bytes(item_data.read())
        # Compute the similarity once; it is both logged and stored.
        score = abs_data.similarity(data)
        print(score)
        # item[5:] drops the five-character "docs/" directory prefix.
        scores[item[5:]] = score
    print("sorting")
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]


def _fetch_title(issn):
    """Return the DOAJ title for ``issn``, or ``" "`` when it is unavailable.

    Network errors, non-200 responses and payloads without a usable title all
    fall back to the blank title instead of failing the whole request.
    """
    try:
        journal_data = requests.get(
            "https://doaj.org/api/v1/search/journals/issn%3A" + issn,
            # Without a timeout a stalled API call would hang the request.
            timeout=10,
        )
    except requests.RequestException:
        return " "
    if journal_data.status_code != 200:
        return " "
    try:
        return journal_data.json()["results"][0]["bibjson"]["title"]
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError covers a body that is not valid JSON.
        return " "


@app.route("/", methods=["GET", "POST"])
def index():
    """Display the index page and, on a valid POST, the best journal match.

    Returns:
        The rendered ``index.html``: bare for non-POST requests, with
        ``error_message`` when validation fails or nothing can be compared,
        and with ``title``/``issn``/``score`` for the best match otherwise.
    """
    form = RegistrationForm(request.form)
    if request.method != "POST":
        return render_template("index.html")
    # Validate exactly once and branch on the result.
    if not form.validate():
        return render_template("index.html", error_message=form.errors["abstract"][0])

    top = _rank_journals(nlp(form.abstract.data))
    if not top:
        # Nothing matched docs/*, so there is no match to report.
        return render_template(
            "index.html",
            error_message="No journals are available to compare against.",
        )

    print("get journal info from API")
    # The template shows a single "best match", so only the top-ranked entry
    # needs a title lookup.
    issn, score = top[0]
    title = _fetch_title(issn)
    print(issn, title)
    return render_template("index.html", title=title, issn=issn, score=score)
if __name__ == "__main__":
    # Run Flask's built-in server on the loopback interface, port 8000, with
    # debug mode on, when this file is executed directly.
    app.run(host="127.0.0.1", port=8000, debug=True)

View File

@ -1,56 +0,0 @@
""" run the comparisons using multiprocessing """
import spacy
import glob
import requests
import multiprocessing
from spacy.tokens import Doc
from datetime import datetime
# Load the medium English spaCy model once; fio() uses its vocab to
# deserialize stored Docs.
nlp = spacy.load("en_core_web_md")
# NOTE(review): `comp` is not referenced again in the visible part of this
# script.
comp = {}
# Prompt for the draft abstract on stdin and parse it with the model.
inp = input("Abstract: ")
abs_data = nlp(inp)
# NOTE(review): `counter` is not referenced again in the visible part of this
# script.
counter = 0
# Start of the timed section; the elapsed time is printed at the end of the
# script. The timer starts after the model load and the prompt.
t0 = datetime.now()
def fio(item):
    """Score one stored journal document against the input abstract.

    This is the function mapped over the file list by the worker pool.

    Args:
        item: Path of a file containing a spaCy ``Doc`` serialized to bytes.

    Returns:
        A ``(similarity, tail)`` tuple. ``tail`` is the last nine characters
        of ``item``; the script later passes it to the DOAJ API as an ISSN.
    """
    with open(item, "rb") as item_data:
        data = Doc(nlp.vocab).from_bytes(item_data.read())
    # Compute the similarity once and reuse it for both the progress output
    # and the return value (it used to be computed twice per document).
    score = abs_data.similarity(data)
    print(score)
    return (score, item[-9:])
# NOTE(review): this pool is created at module import time with no
# `if __name__ == "__main__":` guard. The multiprocessing docs require that
# guard for the spawn start method -- confirm which platforms this script
# targets.
pool = multiprocessing.Pool(16)
# glob.glob() already returns a list, so no extra list() copy is needed.
gl = glob.glob("docs-md/*")
try:
    # Score every stored document in parallel; results keep the order of gl.
    result = pool.map(fio, gl)
finally:
    # Always shut the workers down, even when a task raised.
    pool.close()
    pool.join()

print("sorting")
# Keep the five highest-scoring (similarity, issn) tuples.
top = sorted(result, key=lambda x: x[0], reverse=True)[:5]

print("get journal info from API")
for item in top:
    score = item[0]
    issn = item[1]
    journal_data = requests.get(
        "https://doaj.org/api/v1/search/journals/issn%3A" + issn
    )
    if journal_data.status_code == 200:
        journal_json = journal_data.json()
        try:
            title = journal_json["results"][0]["bibjson"]["title"]
        except (KeyError, IndexError, TypeError):
            # Best effort: the API answered but the payload has no usable
            # title (no results, or an unexpected shape). Only lookup
            # failures are caught so that Ctrl-C and real bugs still surface.
            title = " "
        print(issn, title, score)
    else:
        print(issn, score)

# Report how long scoring plus the API lookups took.
t1 = datetime.now()
print(t1 - t0)

19
sp.py
View File

@ -1,19 +0,0 @@
""" write out the vectorized journals to docs/ directory """
import json
import spacy
from pathlib import Path
nlp = spacy.load("en_core_web_md")

# Stays 0 when abstracts/ contains no .txt files.
counter = 0
pathlist = Path("abstracts/").glob("*.txt")
# Iterate the glob lazily and let enumerate() maintain the running count.
for counter, path in enumerate(pathlist, start=1):
    with open(str(path)) as ab:
        data = json.load(ab)
    print("processing: " + str(path) + " #" + str(counter))
    # First nine characters of the file name. This equals the previous
    # str(path)[10:19] slice without depending on the length of "abstracts/".
    outpath = path.name[:9]
    # Run the model before opening the output file, so a failure here does
    # not leave an empty or truncated file behind in docs/.
    # Only the first 99999 items of the loaded JSON value are processed.
    n_data = nlp(data[:99999])
    with open("docs/{}".format(outpath), "wb") as outfile:
        outfile.write(n_data.to_bytes())

18
templates/index.html Normal file
View File

@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Open Source Journal Matcher</title>
  <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
</head>
<body>
  <h1>Open Source Journal Matcher</h1>
  <form id="form" action="{{ url_for('index') }}" method="POST">
    <input type="text" name="abstract" />
  </form>
  <div id="errors">{{ error_message }}</div>
  {# Render the result only when the view supplied a match. The block used to
     be hard-coded to display:none, so a successful match was never visible. #}
  {% if issn %}
  <div id="data">The best match is:<br/>
    <strong>{{ title }}</strong> (issn: {{ issn }}), with a relevancy score of: <strong>{{ score }}</strong>.
  </div>
  {% endif %}
</body>
</html>