cleanup; add draft flask implementation

2020-01-31 21:06:41 -05:00 · 2020-01-31 21:06:41 -05:00 · 31e869751f
parent 068c5f2966
commit 31e869751f
7 changed files with 90 additions and 134 deletions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2019 
+Copyright (c) 2019 Mark Eaton

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/README.md
+++ b/README.md
@ -1,10 +1,6 @@
 # A journal recommender tool built on the Directory of Open Access Journals

-This is a work in progress. The goal is to create a journal recommender to give journal suggestions based on a draft abstract. It still needs work.
-
-The main problem at this point is that it is very slow. The `compare-*.py` files are iterative attempts at addressing the slowness. They still have bugs.
-
-Some systematic profiling of the various solutions to speed up the code needs to be done. Hopefully this can be added soon.
+This is a work in progress. The goal is to create a journal recommender to give journal suggestions based on a draft abstract. It still needs quite a bit of work. Ultimately, the aim is to have a Flask application combined with "serverless" infrastructure for data analysis.

 Presented at the 18th Annual CUNY IT Conference. New York, NY. December 5, 2019.

--- a/compare-basic.py
+++ b/compare-basic.py
@ -1,53 +0,0 @@
-""" run the comparisons using the most basic approach """
-
-import spacy
-import glob
-import requests
-from spacy.tokens import Doc
-from datetime import datetime
-
-
-# Load the spaCy model once; fio() below uses its vocab to deserialize docs.
-nlp = spacy.load("en_core_web_md")
-
-# Read the draft abstract from stdin and parse it before timing starts.
-inp = input("Abstract: ")
-abs_data = nlp(inp)
-# Collected (similarity, issn) tuples, one per stored journal document.
-result = []
-
-# Start of the timed section (comparison, sorting and API lookups).
-t0 = datetime.now()
-
-
-def fio(item):
-    """Score one stored journal document against the abstract.
-
-    item is the path of a file containing a serialized spaCy Doc.
-    Returns a (similarity, issn) tuple; the ISSN is taken from the last
-    nine characters of the path.
-    """
-    with open(item, "rb") as item_data:
-        data = Doc(nlp.vocab).from_bytes(item_data.read())
-    # Compute the similarity once: it is both printed and returned.
-    score = abs_data.similarity(data)
-    print(score)
-    return (score, item[-9:])
-
-
-# glob.glob() already returns a list, so no extra list() copy is needed.
-gl = glob.glob("docs-md/*")
-result.extend(fio(item) for item in gl)
-
-print("sorting")
-# Keep the five highest similarity scores, best first.
-top = sorted(result, key=lambda x: x[0], reverse=True)[:5]
-
-print("get journal info from API")
-for score, issn in top:
-    journal_data = requests.get(
-        "https://doaj.org/api/v1/search/journals/issn%3A" + issn
-    )
-    if journal_data.status_code == 200:
-        try:
-            title = journal_data.json()["results"][0]["bibjson"]["title"]
-        except (KeyError, IndexError, TypeError, ValueError):
-            # Malformed body, no search hit, or a record without a title:
-            # fall back to a blank title instead of hiding every error.
-            title = " "
-        print(issn, title, score)
-    else:
-        print(issn, score)
-
-t1 = datetime.now()
-print(t1 - t0)
--- a/compare-flask.py
+++ b/compare-flask.py
@ -0,0 +1,70 @@
+""" run the comparisons in a flask app """
+
+import spacy
+import glob
+import collections
+import requests
+from spacy.tokens import Doc
+from flask import Flask, render_template, request
+from wtforms import Form, StringField, validators
+
+# Load the spaCy "en_core_web_md" model once, at import time.
+nlp = spacy.load("en_core_web_md")
+# Process-wide dict for similarity scores keyed by the file name under
+# docs/; it lives for the whole process and is not reset between requests.
+comp = {}
+
+# Flask application object that the route below is registered on.
+app = Flask(__name__)
+
+
+class RegistrationForm(Form):
+    """Form holding the single "abstract" text field posted to the index view."""
+
+    # The abstract must be 25-10000 characters long; the message below is
+    # what validation reports when it is not.
+    abstract = StringField(
+        label="abstract",
+        validators=[
+            validators.Length(
+                min=25,
+                max=10000,
+                message="Your abstract must be between 25 and 10000 characters.",
+            ),
+        ],
+    )
+
+
+def _score_journals(abs_data):
+    """Return {issn: similarity} for every serialized Doc under docs/.
+
+    abs_data is the parsed abstract. A fresh dict is built on every call
+    so scores from one request can never leak into another.
+    """
+    docs_dir = "docs/"
+    scores = {}
+    for counter, path in enumerate(glob.glob(docs_dir + "*"), start=1):
+        with open(path, "rb") as item_data:
+            data = Doc(nlp.vocab).from_bytes(item_data.read())
+        similarity = abs_data.similarity(data)
+        app.logger.debug("%s %s %s", path, counter, similarity)
+        # Strip the directory prefix; the remaining file name is the ISSN.
+        scores[path[len(docs_dir):]] = similarity
+    return scores
+
+
+def _lookup_title(issn):
+    """Return the title DOAJ reports for issn, or a placeholder on failure."""
+    unknown = "(title unavailable)"
+    try:
+        journal_data = requests.get(
+            "https://doaj.org/api/v1/search/journals/issn%3A" + issn,
+            timeout=10,  # never let a stalled API call hang the request
+        )
+    except requests.RequestException:
+        return unknown
+    if journal_data.status_code != 200:
+        return unknown
+    try:
+        return journal_data.json()["results"][0]["bibjson"]["title"]
+    except (KeyError, IndexError, TypeError, ValueError):
+        # Malformed body, no search hit, or a record without a title.
+        return unknown
+
+
+@app.route("/", methods=["GET", "POST"])
+def index():
+    """Display the index page and, on POST, the best-matching journal.
+
+    GET renders the empty form. A POST whose abstract fails validation
+    re-renders the page with ``error_message``. A valid POST compares the
+    abstract with every stored journal document and renders the page with
+    ``title``, ``issn`` and ``score`` of the highest-scoring journal.
+    """
+    if request.method != "POST":
+        return render_template("index.html")
+
+    form = RegistrationForm(request.form)
+    # Validate exactly once; the result decides between the two POST paths.
+    if not form.validate():
+        return render_template("index.html", error_message=form.errors["abstract"][0])
+
+    scores = _score_journals(nlp(form.abstract.data))
+    if not scores:
+        # Nothing to rank (docs/ is empty or missing): report it instead of
+        # failing on unbound result variables.
+        return render_template(
+            "index.html",
+            error_message="No journal data is available to compare against.",
+        )
+
+    app.logger.debug("sorting")
+    top = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]
+
+    app.logger.debug("get journal info from API")
+    # Titles are resolved for the whole top five (as the draft did), but the
+    # page shows only the first, i.e. best, entry.
+    matches = [(issn, _lookup_title(issn), score) for issn, score in top]
+    for issn, title, _ in matches:
+        app.logger.debug("%s %s", issn, title)
+
+    issn, title, score = matches[0]
+    return render_template("index.html", title=title, issn=issn, score=score)
+
+
+if __name__ == "__main__":
+    # Run Flask's built-in server on the loopback interface with debug on.
+    app.run(host="127.0.0.1", port=8000, debug=True)
--- a/compare-multi.py
+++ b/compare-multi.py
@ -1,56 +0,0 @@
-""" run the comparisons using multiprocessing """
-
-import spacy
-import glob
-import requests
-import multiprocessing
-from spacy.tokens import Doc
-from datetime import datetime
-
-
-# Loaded at import time so that every worker process has the model too.
-nlp = spacy.load("en_core_web_md")
-
-# Parsed abstract. Each worker sets it in _init_worker(), so fio() works
-# whether workers are forked or spawned.
-abs_data = None
-
-
-def _init_worker(abstract):
-    """Parse the abstract once per worker process and keep it for fio()."""
-    global abs_data
-    abs_data = nlp(abstract)
-
-
-def fio(item):
-    """Score one stored journal document against the abstract.
-
-    item is the path of a file containing a serialized spaCy Doc.
-    Returns a (similarity, issn) tuple; the ISSN is taken from the last
-    nine characters of the path.
-    """
-    with open(item, "rb") as item_data:
-        data = Doc(nlp.vocab).from_bytes(item_data.read())
-    # Compute the similarity once: it is both printed and returned.
-    score = abs_data.similarity(data)
-    print(score)
-    return (score, item[-9:])
-
-
-def main():
-    """Prompt for an abstract, rank the stored journals, print the top five."""
-    inp = input("Abstract: ")
-
-    t0 = datetime.now()
-
-    gl = glob.glob("docs-md/*")
-    pool = multiprocessing.Pool(16, initializer=_init_worker, initargs=(inp,))
-    try:
-        result = pool.map(fio, gl)
-    finally:
-        # close() + join() lets the workers exit normally.
-        pool.close()
-        pool.join()
-
-    print("sorting")
-    top = sorted(result, key=lambda x: x[0], reverse=True)[:5]
-
-    print("get journal info from API")
-    for score, issn in top:
-        journal_data = requests.get(
-            "https://doaj.org/api/v1/search/journals/issn%3A" + issn
-        )
-        if journal_data.status_code == 200:
-            try:
-                title = journal_data.json()["results"][0]["bibjson"]["title"]
-            except (KeyError, IndexError, TypeError, ValueError):
-                # Malformed body, no search hit, or a record without a title.
-                title = " "
-            print(issn, title, score)
-        else:
-            print(issn, score)
-
-    t1 = datetime.now()
-    print(t1 - t0)
-
-
-# Required for multiprocessing: with the "spawn" start method every worker
-# re-imports this module, and must not prompt or create a pool itself.
-if __name__ == "__main__":
-    main()
--- a/sp.py
+++ b/sp.py
@ -1,19 +0,0 @@
-""" write out the vectorized journals to docs/ directory """
-
-import json
-import spacy
-from pathlib import Path
-
-nlp = spacy.load("en_core_web_md")
-
-# enumerate() numbers the files from 1, replacing the hand-kept counter.
-for counter, path in enumerate(Path("abstracts/").glob("*.txt"), start=1):
-    with open(str(path)) as ab:
-        data = json.loads(ab.read())
-    print("processing: " + str(path) + " #" + str(counter))
-    # Characters 10-18 of the path (just after "abstracts/") name the output.
-    outpath = str(path)[10:19]
-    # Run the NLP step BEFORE opening the output file, so a failure here
-    # cannot leave a truncated file behind in docs/.
-    n_data = nlp(data[:99999])
-    with open("docs/{}".format(outpath), "wb") as outfile:
-        outfile.write(n_data.to_bytes())
--- a/templates/index.html
+++ b/templates/index.html
@ -0,0 +1,18 @@
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <meta charset="utf-8">
+        <title>Open Source Journal Matcher</title>
+        <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
+        <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
+    </head>
+    <body>
+        <h1>Open Source Journal Matcher</h1>
+        <form id="form" action="{{ url_for('index') }}" method="POST">
+            <label for="abstract">Abstract</label>
+            <input type="text" id="abstract" name="abstract" />
+            <button type="submit">Find a journal</button>
+        </form>
+        {# Stays empty unless the view passes error_message. #}
+        <div id="errors">{{ error_message }}</div>
+        {# Rendered only when the view passes a match. It used to be
+           hidden with display:none and nothing ever revealed it. #}
+        {% if issn %}
+        <div id="data">The best match is:<br/>
+            <strong>{{ title }}</strong> (issn: {{ issn }}), with a relevancy score of: <strong>{{ score }}</strong>.
+        </div>
+        {% endif %}
+    </body>
+</html>