Pull journal binaries from GCP
This commit is contained in:
parent
ab8442964a
commit
1f746bf39b
|
@ -1,33 +1,35 @@
|
|||
import json
|
||||
import os
|
||||
import spacy
|
||||
from flask import Response, request
|
||||
from flask import request
|
||||
from google.cloud import storage
|
||||
from spacy.tokens import Doc
|
||||
from thinc.api import Config
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
nlp = spacy.load("en_core_web_md", disable=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"])
|
||||
|
||||
def doaj_trio(request):
|
||||
try:
|
||||
user_nlp = request.data
|
||||
encoded_data = request.data
|
||||
string_data = encoded_data.decode()
|
||||
data = json.loads(string_data)
|
||||
user_nlp = nlp(data['inp'])
|
||||
|
||||
blob = request.headers.get('blob')
|
||||
|
||||
client = storage.Client()
|
||||
bucket = client.get_bucket(os.environ['bucket'])
|
||||
|
||||
config_blob = bucket.get_blob(os.environ['config'])
|
||||
config_text = config_blob.download_as_text()
|
||||
config = Config().from_str(config_text)
|
||||
|
||||
blob_object = bucket.get_blob(blob)
|
||||
journal_nlp = blob_object.download_as_bytes()
|
||||
blob_bytes = blob_object.download_as_bytes()
|
||||
blob_vocab_object = bucket.get_blob(blob + '-vocab')
|
||||
blob_vocab_bytes = blob_vocab_object.download_as_bytes()
|
||||
journal_nlp = Doc(Vocab()).from_bytes(blob_bytes)
|
||||
journal_nlp.vocab.from_bytes(blob_vocab_bytes)
|
||||
|
||||
lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
|
||||
nlp = lang_cls.from_config(config)
|
||||
user_sim = Doc(nlp.vocab).from_bytes(user_nlp)
|
||||
journal_sim = Doc(nlp.vocab).from_bytes(journal_nlp)
|
||||
|
||||
sim = user_sim.similarity(journal_sim)
|
||||
sim = user_nlp.similarity(journal_nlp)
|
||||
return str(sim)
|
||||
|
||||
except:
|
||||
raise
|
||||
|
||||
|
|
13
compare.py
13
compare.py
|
@ -3,7 +3,7 @@
|
|||
import asyncio
|
||||
import asks
|
||||
import regex
|
||||
import settings202109 as settings
|
||||
import settings202201 as settings
|
||||
import aiohttp
|
||||
import langdetect
|
||||
import os
|
||||
|
@ -94,10 +94,9 @@ def index():
|
|||
unordered_scores = {}
|
||||
inp = form.webabstract.data
|
||||
t0 = datetime.now()
|
||||
user_nlp = nlp(inp).to_bytes()
|
||||
|
||||
# do the work
|
||||
asyncio.run(parent1(user_nlp, comp))
|
||||
asyncio.run(parent1(inp, comp))
|
||||
asyncio.run(parent2(comp, unordered_scores))
|
||||
|
||||
# sort the results
|
||||
|
@ -130,15 +129,15 @@ def add_security_headers(resp):
|
|||
return resp
|
||||
|
||||
|
||||
async def parent1(user_nlp, comp):
|
||||
async def parent1(inp, comp):
|
||||
""" manage the async calls to GCP """
|
||||
await asyncio.gather(
|
||||
*[cloud_work(blob, user_nlp, comp, 0) for blob in settings.bucket_list[:10]]
|
||||
*[cloud_work(blob, inp, comp, 0) for blob in settings.bucket_list]
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
async def cloud_work(blob, user_nlp, comp, count):
|
||||
async def cloud_work(blob, inp, comp, count):
|
||||
""" interact with google cloud function """
|
||||
max_out = 0
|
||||
try:
|
||||
|
@ -146,7 +145,7 @@ async def cloud_work(blob, user_nlp, comp, count):
|
|||
while max_out < 6:
|
||||
async with session.post(
|
||||
settings.cloud_function,
|
||||
data = user_nlp,
|
||||
json = {'inp': inp},
|
||||
headers = {'blob': blob},
|
||||
) as resp:
|
||||
if max_out >= 5:
|
||||
|
|
Loading…
Reference in New Issue