pull journal binaries from gcp

This commit is contained in:
Mark Eaton 2022-01-18 02:33:47 -05:00
parent ab8442964a
commit 1f746bf39b
2 changed files with 22 additions and 21 deletions

View File

@ -1,33 +1,35 @@
import json
import os
import spacy
from flask import Response, request
from flask import request
from google.cloud import storage
from spacy.tokens import Doc
from thinc.api import Config
from spacy.vocab import Vocab
nlp = spacy.load("en_core_web_md", disable=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"])
# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were lost, so removed and added statements appear side by side.
# Read it as a description of a change, not as runnable code.
# Handler taking `request`: computes a similarity score between the submitted
# text and a journal document stored in a cloud bucket, returned as a string.
def doaj_trio(request):
try:
# Presumably the removed line: the raw request body was used directly as
# serialized Doc bytes.
user_nlp = request.data
# Presumably the replacement: decode the body, parse it as JSON and run the
# text found under the 'inp' key through the `nlp` pipeline.
encoded_data = request.data
string_data = encoded_data.decode()
data = json.loads(string_data)
user_nlp = nlp(data['inp'])
# The name of the object to compare against comes from the 'blob' header.
blob = request.headers.get('blob')
client = storage.Client()
# Bucket name and config object name are read from environment variables.
bucket = client.get_bucket(os.environ['bucket'])
config_blob = bucket.get_blob(os.environ['config'])
config_text = config_blob.download_as_text()
config = Config().from_str(config_text)
blob_object = bucket.get_blob(blob)
# Presumably removed: the downloaded bytes were kept under `journal_nlp`.
journal_nlp = blob_object.download_as_bytes()
# Presumably added: download the document bytes plus a companion object named
# '<blob>-vocab', rebuild the Doc on a fresh Vocab, then load the vocab bytes
# into that Doc's vocab.
blob_bytes = blob_object.download_as_bytes()
blob_vocab_object = bucket.get_blob(blob + '-vocab')
blob_vocab_bytes = blob_vocab_object.download_as_bytes()
journal_nlp = Doc(Vocab()).from_bytes(blob_bytes)
journal_nlp.vocab.from_bytes(blob_vocab_bytes)
# Presumably removed: build a pipeline from the downloaded config and
# deserialize both documents against its vocab before comparing them.
# NOTE(review): if the `nlp = ...` assignment below survived inside this
# function, `nlp` would be a local name and the earlier `nlp(data['inp'])`
# call would fail - confirm against the real post-commit file.
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
nlp = lang_cls.from_config(config)
user_sim = Doc(nlp.vocab).from_bytes(user_nlp)
journal_sim = Doc(nlp.vocab).from_bytes(journal_nlp)
sim = user_sim.similarity(journal_sim)
# Presumably added: compare the two Doc objects directly.
sim = user_nlp.similarity(journal_nlp)
# The score is returned as its string representation.
return str(sim)
# Bare except that only re-raises, so any error propagates unchanged.
except:
raise

View File

@ -3,7 +3,7 @@
import asyncio
import asks
import regex
import settings202109 as settings
import settings202201 as settings
import aiohttp
import langdetect
import os
@ -94,10 +94,9 @@ def index():
unordered_scores = {}
inp = form.webabstract.data
t0 = datetime.now()
user_nlp = nlp(inp).to_bytes()
# do the work
asyncio.run(parent1(user_nlp, comp))
asyncio.run(parent1(inp, comp))
asyncio.run(parent2(comp, unordered_scores))
# sort the results
@ -130,15 +129,15 @@ def add_security_headers(resp):
return resp
# NOTE(review): old and new versions of the changed lines appear back to back
# here (diff markers and indentation lost); the second line of each pair is
# presumably the new one - confirm against the real file.
async def parent1(user_nlp, comp):
async def parent1(inp, comp):
""" manage the async calls to GCP """
# Start one cloud_work call per blob name and wait for all of them to finish.
await asyncio.gather(
# Presumably old: passes `user_nlp` and only the first 10 entries of
# settings.bucket_list.
*[cloud_work(blob, user_nlp, comp, 0) for blob in settings.bucket_list[:10]]
# Presumably new: passes the input `inp` and every entry of
# settings.bucket_list.
*[cloud_work(blob, inp, comp, 0) for blob in settings.bucket_list]
)
return
async def cloud_work(blob, user_nlp, comp, count):
async def cloud_work(blob, inp, comp, count):
""" interact with google cloud function """
max_out = 0
try:
@ -146,7 +145,7 @@ async def cloud_work(blob, user_nlp, comp, count):
while max_out < 6:
async with session.post(
settings.cloud_function,
data = user_nlp,
json = {'inp': inp},
headers = {'blob': blob},
) as resp:
if max_out >= 5: