cleanup; add draft flask implementation
This commit is contained in:
parent
068c5f2966
commit
31e869751f
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2019
|
||||
Copyright (c) 2019 Mark Eaton
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
|
|
@ -1,10 +1,6 @@
|
|||
# A journal recommender tool built on the Directory of Open Access Journals
|
||||
|
||||
This is a work in progress. The goal is to create a journal recommender to give journal suggestions based on a draft abstract. It still needs work.
|
||||
|
||||
The main problem at this point is that it is very slow. The `compare-*.py` files are iterative attempts at addressing the slowness. They still have bugs.
|
||||
|
||||
Some systematic profiling of the various solutions to speed up the code needs to be done. Hopefully this can be added soon.
|
||||
This is a work in progress. The goal is to create a journal recommender to give journal suggestions based on a draft abstract. It still needs quite a bit of work. Ultimately, the aim is to have a Flask application combined with "serverless" infrastructure for data analysis.
|
||||
|
||||
Presented at the 18th Annual CUNY IT Conference. New York, NY. December 5, 2019.
|
||||
|
||||
|
|
|
@ -1,53 +0,0 @@
|
|||
""" run the comparisons using the most basic approach """
|
||||
|
||||
import spacy
|
||||
import glob
|
||||
import requests
|
||||
from spacy.tokens import Doc
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
nlp = spacy.load("en_core_web_md")
|
||||
comp = {}
|
||||
|
||||
inp = input("Abstract: ")
|
||||
abs_data = nlp(inp)
|
||||
counter = 0
|
||||
result = []
|
||||
|
||||
t0 = datetime.now()
|
||||
|
||||
|
||||
def fio(item):
|
||||
with open(item, "rb") as item_data:
|
||||
data = Doc(nlp.vocab).from_bytes(item_data.read())
|
||||
print(abs_data.similarity(data))
|
||||
return (abs_data.similarity(data), item[-9:])
|
||||
|
||||
|
||||
gl = list(glob.glob("docs-md/*"))
|
||||
for item in gl:
|
||||
result.append(fio(item))
|
||||
|
||||
print("sorting")
|
||||
top = sorted(result, key=lambda x: x[0], reverse=True)[:5]
|
||||
|
||||
print("get journal info from API")
|
||||
for item in top:
|
||||
journal_data = requests.get(
|
||||
"https://doaj.org/api/v1/search/journals/issn%3A" + item[1]
|
||||
)
|
||||
issn = item[1]
|
||||
score = item[0]
|
||||
if journal_data.status_code == 200:
|
||||
journal_json = journal_data.json()
|
||||
try:
|
||||
title = journal_json["results"][0]["bibjson"]["title"]
|
||||
except:
|
||||
title = " "
|
||||
print(issn, title, score)
|
||||
else:
|
||||
print(issn, score)
|
||||
|
||||
t1 = datetime.now()
|
||||
print(t1 - t0)
|
|
@ -0,0 +1,70 @@
|
|||
""" run the comparisons in a flask app """
|
||||
|
||||
import spacy
|
||||
import glob
|
||||
import collections
|
||||
import requests
|
||||
from spacy.tokens import Doc
|
||||
from flask import Flask, render_template, request
|
||||
from wtforms import Form, StringField, validators
|
||||
|
||||
nlp = spacy.load("en_core_web_md")
|
||||
comp = {}
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
class RegistrationForm(Form):
    """Form holding the draft abstract to be matched against journals."""

    # 25–10000 chars: long enough to vectorize meaningfully, short enough
    # to keep the comparison loop responsive.
    abstract = StringField(
        "abstract",
        [
            # Use the canonical Length validator; lowercase `length` is a
            # deprecated alias in WTForms.
            validators.Length(
                min=25,
                max=10000,
                message="Your abstract must be between 25 and 10000 characters.",
            )
        ],
    )
|
||||
|
||||
|
||||
@app.route("/", methods=["GET", "POST"])
|
||||
def index():
|
||||
""" display index page """
|
||||
form = RegistrationForm(request.form)
|
||||
if request.method == "POST" and form.validate():
|
||||
|
||||
abs_data = nlp(form.abstract.data)
|
||||
counter = 0
|
||||
|
||||
for item in glob.glob("docs/*"):
|
||||
counter += 1
|
||||
print(item, counter)
|
||||
with open(item, "rb") as item_data:
|
||||
data = Doc(nlp.vocab).from_bytes(item_data.read())
|
||||
print(abs_data.similarity(data))
|
||||
comp[item[5:]] = abs_data.similarity(data)
|
||||
|
||||
print("sorting")
|
||||
top = sorted(comp.items(), key=lambda x: x[1], reverse=True)[:5]
|
||||
|
||||
print("get journal info from API")
|
||||
for item in top:
|
||||
journal_data = requests.get(
|
||||
"https://doaj.org/api/v1/search/journals/issn%3A" + item[0]
|
||||
)
|
||||
journal_json = journal_data.json()
|
||||
title = journal_json["results"][0]["bibjson"]["title"]
|
||||
issn = item[0]
|
||||
score = item[1]
|
||||
print(issn, title)
|
||||
return render_template("index.html", title=title, issn=issn, score=score)
|
||||
|
||||
elif request.method == "POST" and not form.validate():
|
||||
return render_template("index.html", error_message=form.errors["abstract"][0])
|
||||
|
||||
else:
|
||||
return render_template("index.html")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Development server only; use a proper WSGI server in production.
    app.run(host="127.0.0.1", port=8000, debug=True)
|
|
@ -1,56 +0,0 @@
|
|||
""" run the comparisons using multiprocessing """
|
||||
|
||||
import spacy
|
||||
import glob
|
||||
import requests
|
||||
import multiprocessing
|
||||
from spacy.tokens import Doc
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
nlp = spacy.load("en_core_web_md")
|
||||
comp = {}
|
||||
|
||||
inp = input("Abstract: ")
|
||||
abs_data = nlp(inp)
|
||||
counter = 0
|
||||
|
||||
t0 = datetime.now()
|
||||
|
||||
|
||||
def fio(item):
|
||||
with open(item, "rb") as item_data:
|
||||
data = Doc(nlp.vocab).from_bytes(item_data.read())
|
||||
print(abs_data.similarity(data))
|
||||
return (abs_data.similarity(data), item[-9:])
|
||||
|
||||
|
||||
pool = multiprocessing.Pool(16)
|
||||
|
||||
gl = list(glob.glob("docs-md/*"))
|
||||
result = pool.map(fio, gl)
|
||||
pool.close()
|
||||
pool.join()
|
||||
|
||||
print("sorting")
|
||||
top = sorted(result, key=lambda x: x[0], reverse=True)[:5]
|
||||
|
||||
print("get journal info from API")
|
||||
for item in top:
|
||||
journal_data = requests.get(
|
||||
"https://doaj.org/api/v1/search/journals/issn%3A" + item[1]
|
||||
)
|
||||
issn = item[1]
|
||||
score = item[0]
|
||||
if journal_data.status_code == 200:
|
||||
journal_json = journal_data.json()
|
||||
try:
|
||||
title = journal_json["results"][0]["bibjson"]["title"]
|
||||
except:
|
||||
title = " "
|
||||
print(issn, title, score)
|
||||
else:
|
||||
print(issn, score)
|
||||
|
||||
t1 = datetime.now()
|
||||
print(t1 - t0)
|
19
sp.py
19
sp.py
|
@ -1,19 +0,0 @@
|
|||
""" write out the vectorized journals to docs/ directory """
|
||||
|
||||
import json
|
||||
import spacy
|
||||
from pathlib import Path
|
||||
|
||||
nlp = spacy.load("en_core_web_md")
|
||||
counter = 0
|
||||
|
||||
pathlist = Path("abstracts/").glob("*.txt")
|
||||
for path in list(pathlist):
|
||||
with open(str(path)) as ab:
|
||||
data = json.loads(ab.read())
|
||||
counter += 1
|
||||
print("processing: " + str(path) + " #" + str(counter))
|
||||
outpath = str(path)[10:19]
|
||||
with open("docs/{}".format(outpath), "wb") as outfile:
|
||||
n_data = nlp(data[:99999])
|
||||
outfile.write(n_data.to_bytes())
|
|
@ -0,0 +1,18 @@
|
|||
<!DOCTYPE html>
<html>
<head>
    <title>Open Source Journal Matcher</title>
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
</head>
<body>
    <h1>Open Source Journal Matcher</h1>
    <form id="form" action="{{ url_for('index') }}" method="POST">
        <input type="text" name="abstract" />
    </form>
    <div id="errors">{{ error_message }}</div>
    {% if title %}
    <!-- Rendered only when the server returns a match. The original
         hard-coded display:none on this div and no script ever revealed
         it, so results were never visible. -->
    <div id="data">The best match is:<br/>
        <strong>{{ title }}</strong> (issn: {{ issn }}), with a relevancy score of: <strong>{{ score }}</strong>.
    </div>
    {% endif %}
</body>
</html>
|
Loading…
Reference in New Issue