pull journal binaries from gcp

2022-01-18 02:33:47 -05:00 · 2022-01-18 02:33:47 -05:00 · 1f746bf39b
parent ab8442964a
commit 1f746bf39b
2 changed files with 22 additions and 21 deletions
--- a/cloud_function.py
+++ b/cloud_function.py
@@ -1,33 +1,35 @@
+import json
 import os
 import spacy
-from flask import Response, request
+from flask import request
 from google.cloud import storage
 from spacy.tokens import Doc
-from thinc.api import Config
+from spacy.vocab import Vocab

+nlp = spacy.load("en_core_web_md", disable=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"])

 def doaj_trio(request):
    try:
-        user_nlp = request.data
+        encoded_data = request.data
+        string_data = encoded_data.decode()
+        data = json.loads(string_data)
+        user_nlp = nlp(data['inp'])
+
        blob = request.headers.get('blob')

        client = storage.Client()
        bucket = client.get_bucket(os.environ['bucket'])

-        config_blob = bucket.get_blob(os.environ['config'])
-        config_text = config_blob.download_as_text()
-        config = Config().from_str(config_text)
-
        blob_object = bucket.get_blob(blob)
-        journal_nlp = blob_object.download_as_bytes()
+        blob_bytes = blob_object.download_as_bytes()
+        blob_vocab_object = bucket.get_blob(blob + '-vocab')
+        blob_vocab_bytes = blob_vocab_object.download_as_bytes()
+        journal_nlp = Doc(Vocab()).from_bytes(blob_bytes)
+        journal_nlp.vocab.from_bytes(blob_vocab_bytes)

-        lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
-        nlp = lang_cls.from_config(config)
-        user_sim = Doc(nlp.vocab).from_bytes(user_nlp)
-        journal_sim = Doc(nlp.vocab).from_bytes(journal_nlp)
-
-        sim = user_sim.similarity(journal_sim)
+        sim = user_nlp.similarity(journal_nlp)
        return str(sim)

    except:
        raise
+
--- a/compare.py
+++ b/compare.py
@@ -3,7 +3,7 @@
 import asyncio
 import asks
 import regex
-import settings202109 as settings
+import settings202201 as settings
 import aiohttp
 import langdetect
 import os
@@ -94,10 +94,9 @@ def index():
        unordered_scores = {}
        inp = form.webabstract.data
        t0 = datetime.now()
-        user_nlp = nlp(inp).to_bytes()

        # do the work
-        asyncio.run(parent1(user_nlp, comp))
+        asyncio.run(parent1(inp, comp))
        asyncio.run(parent2(comp, unordered_scores))

        # sort the results
@@ -130,15 +129,15 @@ def add_security_headers(resp):
    return resp


-async def parent1(user_nlp, comp):
+async def parent1(inp, comp):
    """ manage the async calls to GCP """
    await asyncio.gather(
-            *[cloud_work(blob, user_nlp, comp, 0) for blob in settings.bucket_list[:10]]
+            *[cloud_work(blob, inp, comp, 0) for blob in settings.bucket_list]
    )
    return


-async def cloud_work(blob, user_nlp, comp, count):
+async def cloud_work(blob, inp, comp, count):
    """ interact with google cloud function """
    max_out = 0
    try:
@@ -146,7 +145,7 @@ async def cloud_work(blob, user_nlp, comp, count):
            while max_out < 6:
                async with session.post(
                    settings.cloud_function,
-                    data = user_nlp,
+                    json = {'inp': inp},
                    headers = {'blob': blob},
                ) as resp:
                    if max_out >= 5: