Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-8695x1dy9: Changes for running predictions async in a bg process #214

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ services:
context: ./webapp
args:
SPACY_MODELS: ${SPACY_MODELS:-en_core_web_md}
image: medcattrainer-api
restart: always
volumes:
- ./webapp/api/core:/home/api/core
Expand All @@ -23,6 +24,25 @@ services:
- MCT_VERSION=latest
command: /home/scripts/run.sh

# bg process task runner
# Second container started from the `medcattrainer-api` image; it depends on the
# `medcattrainer` service and runs run-bg-process.sh instead of run.sh.
medcattrainer-bg-process:
image: medcattrainer-api
depends_on:
- medcattrainer
restart: always
volumes:
# local source and the bg-process launch script are bind-mounted from ./webapp
- ./webapp/api/core:/home/api/core
- ./webapp/api/api:/home/api/api
- ./webapp/scripts/run-bg-process.sh:/home/scripts/run-bg-process.sh
- ./configs:/home/configs
# named volumes for media, static files and the db
- api-media:/home/api/media
- api-static:/home/api/static
- api-db:/home/api/db
env_file:
- ./envs/env
command: /home/scripts/run-bg-process.sh


nginx:
image: nginx
restart: always
Expand Down
14 changes: 14 additions & 0 deletions docker-compose-prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,20 @@ services:
- MCT_VERSION=v2.17.1
command: /home/scripts/run.sh

# bg process task runner
# Container that runs /home/scripts/run-bg-process.sh from the published image.
# NOTE(review): this image is pinned to v2.17.1 while docker-compose.yml uses
# v2.17.3 for the same service; confirm the v2.17.1 image actually ships
# /home/scripts/run-bg-process.sh, otherwise this container cannot start.
medcattrainer-bg-process:
image: cogstacksystems/medcat-trainer:v2.17.1
restart: always
volumes:
- ./configs:/home/configs
# named volumes for media, static files, the db and db backups
- api-media:/home/api/media
- api-static:/home/api/static
- api-db:/home/api/db
- api-db-backup:/home/api/db-backup
env_file:
- ./envs/env
command: /home/scripts/run-bg-process.sh

# crontab - for db backup
medcattrainer-db-backup:
image: cogstacksystems/medcat-trainer:v2.17.1
Expand Down
21 changes: 18 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
# projects are not used.

services:
# api server
medcattrainer:
image: cogstacksystems/medcat-trainer:v2.17.1
image: cogstacksystems/medcat-trainer:v2.17.3
restart: always
volumes:
- ./configs:/home/configs
Expand All @@ -14,12 +15,26 @@ services:
env_file:
- ./envs/env
environment:
- MCT_VERSION=v2.17.1
- MCT_VERSION=v2.17.3
command: /home/scripts/run.sh

# bg process task runner
# Container started from the same v2.17.3 image as the api server, but running
# /home/scripts/run-bg-process.sh instead of /home/scripts/run.sh.
medcattrainer-bg-process:
image: cogstacksystems/medcat-trainer:v2.17.3
restart: always
volumes:
- ./configs:/home/configs
# named volumes for media, static files, the db and db backups
- api-media:/home/api/media
- api-static:/home/api/static
- api-db:/home/api/db
- api-db-backup:/home/api/db-backup
env_file:
- ./envs/env
command: /home/scripts/run-bg-process.sh

# crontab - for db backup
medcattrainer-db-backup:
image: cogstacksystems/medcat-trainer:v2.17.1
image: cogstacksystems/medcat-trainer:v2.17.3
restart: always
volumes:
- ./configs:/home/configs
Expand Down
2 changes: 2 additions & 0 deletions envs/env
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ OPENBLAS_NUM_THREADS=1

### MedCAT cfg ###
MEDCAT_CONFIG_FILE=/home/configs/base.txt
# maximum number of MedCAT models that can be cached, and therefore run in bg processes, at any one time
MAX_MEDCAT_MODELS=2

### Deployment Realm ###
ENV=non-prod
Expand Down
2 changes: 2 additions & 0 deletions envs/env-prod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ OPENBLAS_NUM_THREADS=1

### MedCAT cfg ###
MEDCAT_CONFIG_FILE=/home/configs/base.txt
# maximum number of MedCAT models that can be cached, and therefore run in bg processes, at any one time
MAX_MEDCAT_MODELS=2
ENV=prod

# SECRET KEY - edit this for prod deployments,
Expand Down
1 change: 1 addition & 0 deletions webapp/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ RUN for SPACY_MODEL in ${SPACY_MODELS}; do python -m spacy download ${SPACY_MODE

WORKDIR /home/api/
# Make both container entrypoint scripts executable in a single image layer.
RUN chmod a+x /home/scripts/run.sh /home/scripts/run-bg-process.sh
14 changes: 14 additions & 0 deletions webapp/api/api/model_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,22 @@
VOCAB_MAP = {}
CAT_MAP = {}

logger = logging.getLogger(__name__)


def _read_max_models_loaded(default: int = 1) -> int:
    """Return the model cache size configured through ``MAX_MEDCAT_MODELS``.

    ``os.getenv`` returns a string whenever the variable is set, so the value
    is converted to ``int`` here; comparing a ``len()`` against the raw string
    would never match and the caches would never be trimmed.

    Args:
        default: Size used when the variable is unset or is not an integer.

    Returns:
        The configured cache size, never smaller than 1 so that the model
        that was just loaded can always stay cached.
    """
    raw_value = os.getenv("MAX_MEDCAT_MODELS")
    if raw_value is None:
        return default
    try:
        parsed = int(raw_value)
    except ValueError:
        logger.warning('Invalid MAX_MEDCAT_MODELS value %r, falling back to %s',
                       raw_value, default)
        return default
    return max(parsed, 1)


_MAX_MODELS_LOADED = _read_max_models_loaded()


def _evict_oldest(model_map: dict, limit: int) -> None:
    """Drop the oldest entries of ``model_map`` until it holds at most ``limit``."""
    while len(model_map) > limit:
        # dicts iterate in insertion order, so the first key is the oldest entry
        model_map.pop(next(iter(model_map)))


def _clear_models(cdb_map: Dict[str, CDB]=CDB_MAP,
                  vocab_map: Dict[str, Vocab]=VOCAB_MAP,
                  cat_map: Dict[str, CAT]=CAT_MAP):
    """Trim the model caches so none holds more than ``_MAX_MODELS_LOADED`` entries.

    Intended to be called right after a model has been added to a cache: only
    entries in excess of the limit are removed, oldest first, so the entry
    that was just inserted is kept.

    Args:
        cdb_map: Cache of loaded CDB objects (defaults to the module cache).
        vocab_map: Cache of loaded Vocab objects (defaults to the module cache).
        cat_map: Cache of loaded CAT objects (defaults to the module cache).
    """
    for model_map in (cat_map, cdb_map, vocab_map):
        _evict_oldest(model_map, _MAX_MODELS_LOADED)


def get_medcat_from_cdb_vocab(project,
cdb_map: Dict[str, CDB]=CDB_MAP,
vocab_map: Dict[str, Vocab]=VOCAB_MAP,
Expand Down Expand Up @@ -61,6 +73,7 @@ def get_medcat_from_cdb_vocab(project,
vocab_map[vocab_id] = vocab
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
cat_map[cat_id] = cat
_clear_models(cat_map=cat_map, cdb_map=cdb_map, vocab_map=vocab_map)
return cat


Expand All @@ -70,6 +83,7 @@ def get_medcat_from_model_pack(project, cat_map: Dict[str, CAT]=CAT_MAP) -> CAT:
logger.info('Loading model pack from:%s', model_pack_obj.model_pack.path)
cat = CAT.load_model_pack(model_pack_obj.model_pack.path)
cat_map[cat_id] = cat
_clear_models(cat_map=cat_map)
return cat


Expand Down
7 changes: 4 additions & 3 deletions webapp/api/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,17 +240,16 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int):
project = ProjectAnnotateEntities.objects.get(id=project_id)
docs = Document.objects.filter(id__in=doc_ids)

logger.info('Loading CAT object in bg process')
logger.info('Loading CAT object in bg process for project: %s', project.id)
cat = get_medcat(project=project)

# Set CAT filters
cat.config.linking['filters']['cuis'] = project.cuis

for doc in docs:
logger.info(f'Running MedCAT model over doc: {doc.id}')
logger.info(f'Running MedCAT model for project {project.id}:{project.name} over doc: {doc.id}')
spacy_doc = cat(doc.text)
anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project)

add_annotations(spacy_doc=spacy_doc,
user=user,
project=project,
Expand All @@ -260,6 +259,8 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int):
# add doc to prepared_documents
project.prepared_documents.add(doc)
project.save()
logger.info('Prepared all docs for project: %s, docs processed: %s',
project.id, project.prepared_documents)


@receiver(post_save, sender=ProjectAnnotateEntities)
Expand Down
61 changes: 48 additions & 13 deletions webapp/api/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from background_task.models import Task, CompletedTask
from django.contrib.auth.views import PasswordResetView
from django.core.exceptions import ObjectDoesNotExist
from django.http import HttpResponseBadRequest, HttpResponseServerError, HttpResponse
from django.shortcuts import render
from django.utils import timezone
Expand Down Expand Up @@ -245,11 +246,6 @@ def prepare_documents(request):
'but is still set on the project. To fix remove and reset the '
'cui filter file' % project.cuis_file}, status=500)

if request.data.get('bg_task'):
# execute model infer in bg
job = prep_docs(p_id, d_ids, user.id)
return Response({'bg_job_id': job.id})

try:
for d_id in d_ids:
document = Document.objects.get(id=d_id)
Expand Down Expand Up @@ -294,24 +290,59 @@ def prepare_documents(request):
return Response({'message': 'Documents prepared successfully'})


@api_view(http_method_names=['POST'])
def prepare_documents_bg(request):
    """Queue a background job that prepares documents of a project.

    Reads ``project_id`` from the request body, selects every document of the
    project's dataset that either has no AnnotatedEntity for this project or
    is listed in the project's validated documents, and hands those ids to
    ``prep_docs``.

    Returns:
        ``{'bg_job_id': <id>}`` for the queued job, or a 400 response when
        ``project_id`` is missing or does not match a project.
    """
    user = request.user
    p_id = request.data.get('project_id')
    if p_id is None:
        return HttpResponseBadRequest('No project_id provided')
    try:
        project = ProjectAnnotateEntities.objects.get(id=p_id)
    except ObjectDoesNotExist:
        return HttpResponseBadRequest('No Project found for ID: %s' % p_id)
    docs = Document.objects.filter(dataset=project.dataset)

    # Load the validated document ids once instead of re-running the
    # validated_documents query for every document in the dataset.
    validated_ids = set(project.validated_documents.values_list('id', flat=True))

    # exists() asks the database for a yes/no answer rather than fetching
    # every AnnotatedEntity row just to measure the result length.
    d_ids = [
        d.id for d in docs
        if d.id in validated_ids
        or not AnnotatedEntity.objects.filter(document=d).filter(project=project).exists()
    ]

    # execute model infer in bg
    job = prep_docs(p_id, d_ids, user.id)
    return Response({'bg_job_id': job.id})


@api_view(http_method_names=['GET'])
def prepare_docs_bg_tasks(request):
    """List queued/running and completed 'doc_prep' background tasks.

    An optional ``project`` query parameter restricts the listing to tasks
    queued for that project id; without it every 'doc_prep' task is returned.

    Returns:
        ``{'running_tasks': [...], 'comp_tasks': [...]}`` where every entry has
        the keys ``document``, ``project`` and ``user_id``, or a 400 response
        when ``project`` is given but is not an integer.
    """
    project_param = request.GET.get('project')
    proj_id = None
    if project_param is not None:
        try:
            proj_id = int(project_param)
        except ValueError:
            # the raw value is deliberately not echoed back to the client
            return HttpResponseBadRequest('Query parameter "project" must be an integer')

    def summarise(tasks):
        summaries = []
        for task in tasks:
            # Element 0 of the stored params holds the positional arguments
            # given to prep_docs: (project id, list of doc ids, user id).
            # Parsed once per task.
            task_args = json.loads(task.task_params)[0]
            if proj_id is not None and task_args[0] != proj_id:
                continue
            doc_ids = task_args[1]
            summaries.append({
                # a job queued with no documents has no first document id
                'document': doc_ids[0] if doc_ids else None,
                'project': task_args[0],
                'user_id': task_args[2]
            })
        return summaries

    running_tasks = summarise(Task.objects.filter(queue='doc_prep'))
    complete_tasks = summarise(CompletedTask.objects.filter(queue='doc_prep'))
    return Response({'running_tasks': running_tasks, 'comp_tasks': complete_tasks})


@api_view(http_method_names=['GET', 'DELETE'])
def prepare_docs_bg_task(request, proj_id):
    """Report on, or cancel, the background document preparation of a project.

    GET: returns the project's prepared document count next to the size of its
    dataset, so a client can derive progress.
    DELETE: removes every 'doc_prep' task queued for the project.

    Args:
        request: The incoming request.
        proj_id: Id of the project, taken from the URL.

    Returns:
        GET: ``{'proj_id', 'dataset_len', 'prepd_docs_len'}``.
        DELETE: a confirmation message.
        A 400 response when the project (GET) or a matching task (DELETE)
        cannot be found.
    """
    if request.method == 'GET':
        # state of bg running process as determined by prepared docs
        try:
            proj = ProjectAnnotateEntities.objects.get(id=proj_id)
        except ObjectDoesNotExist:
            # Format the message before building the response: any extra
            # positional argument would be taken as the content type.
            return HttpResponseBadRequest('No Project found for ID: %s' % proj_id)
        prepd_docs_count = proj.prepared_documents.count()
        # reuse the project fetched above instead of querying for it again
        ds_total_count = Document.objects.filter(dataset=proj.dataset.id).count()
        return Response({'proj_id': proj_id, 'dataset_len': ds_total_count, 'prepd_docs_len': prepd_docs_count})

    # DELETE: collect every task of this project, not only one per project id
    task_ids = [task.id for task in Task.objects.filter(queue='doc_prep')
                if json.loads(task.task_params)[0][0] == proj_id]
    if not task_ids:
        return HttpResponseBadRequest('Could not find running BG Process to stop')
    # NOTE(review): this deletes the Task rows; presumably a task that is
    # already executing is not interrupted by this - confirm.
    Task.objects.filter(id__in=task_ids).delete()
    return Response("Successfully stopped running response")

@api_view(http_method_names=['POST'])
def add_annotation(request):
# Get project id
Expand Down Expand Up @@ -620,7 +651,11 @@ def version(_):
def concept_search_index_available(request):
    """Report whether concept search collections exist for the given CDB ids.

    Reads a comma separated ``cdbs`` query parameter, ignores empty items and
    delegates to ``collections_available``.

    Returns:
        The response produced by ``collections_available``, or a 500 response
        describing the failure when that call raises.
    """
    cdb_ids = [c for c in request.GET.get('cdbs', '').split(',') if c]
    try:
        return collections_available(cdb_ids)
    except Exception as e:
        # Broad on purpose: any failure of the search lookup is reported as
        # "service not available" rather than as an unhandled error.
        # The message is formatted before building the response (an extra
        # positional argument would be taken as the content type) and served
        # as plain text because the exception text is not HTML-escaped.
        return HttpResponseServerError(
            "Solr Search Service not available check the service is up, running "
            "and configured correctly. %s" % e,
            content_type='text/plain')


@api_view(http_method_names=['GET'])
Expand Down
7 changes: 4 additions & 3 deletions webapp/api/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Extra CSRF trusted origins can be supplied as a comma separated list in the
# CSRF_TRUSTED_ORIGINS environment variable; surrounding whitespace and empty
# items are dropped so values such as "a, b," are accepted.
environ_origins = os.environ.get('CSRF_TRUSTED_ORIGINS', '')
trusted_origins = [origin.strip() for origin in environ_origins.split(',') if origin.strip()]

# The local development origins are always trusted.
CSRF_TRUSTED_ORIGINS = ['http://127.0.0.1:8001', 'http://localhost:8001'] + trusted_origins

SECURE_CROSS_ORIGIN_OPENER_POLICY = None

Expand Down
2 changes: 2 additions & 0 deletions webapp/api/core/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
path('api/anno-conf/', api.views.get_anno_tool_conf),
path('api/search-concepts/', api.views.search_solr),
path('api/prepare-documents/', api.views.prepare_documents),
path('api/prepare-documents-bg/', api.views.prepare_documents_bg),
path('api/prep-docs-bg-tasks/', api.views.prepare_docs_bg_tasks),
path('api/prep-docs-bg-tasks/<int:proj_id>/', api.views.prepare_docs_bg_task),
path('api/api-token-auth/', auth_views.obtain_auth_token),
path('admin/', admin.site.urls),
path('api/api-auth/', include('rest_framework.urls', namespace='rest_framework')),
Expand Down
16 changes: 0 additions & 16 deletions webapp/frontend/src/components/common/DocumentSummary.vue
Original file line number Diff line number Diff line change
Expand Up @@ -67,23 +67,7 @@ export default {
runningBgTasks: []
}
},
created() {
// this.pollDocPrepStatus(true)
},
methods: {
pollDocPrepStatus (pollInfinite) {
if (this.projId) {
this.$http.get(`/api/prep-docs-bg-tasks/?project=${this.projId}`).then(resp => {
this.runningBgTasks = resp.data.running_tasks.map(d => d.document)
this.completeBgTasks = resp.data.comp_tasks.map(d => d.document)
})
if (pollInfinite) {
setTimeout(this.pollDocPrepStatus, 5000)
}
} else {
setTimeout(this.pollDocPrepStatus, 5000)
}
},
scrollSelectedDocId () {
const el = document.getElementsByClassName('selected-doc')
if (el.length > 0) {
Expand Down
Loading