diff --git a/backend/dataset/admin.py b/backend/dataset/admin.py index 140171494..0ecdb9a6d 100644 --- a/backend/dataset/admin.py +++ b/backend/dataset/admin.py @@ -1,4 +1,4 @@ -import resource +# import resource from django.contrib import admin from import_export.admin import ImportExportActionModelAdmin from .resources import * diff --git a/backend/dataset/migrations/0047_speechconversation_freeze_task.py b/backend/dataset/migrations/0047_speechconversation_freeze_task.py new file mode 100644 index 000000000..488cf352a --- /dev/null +++ b/backend/dataset/migrations/0047_speechconversation_freeze_task.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.14 on 2024-12-31 01:54 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('dataset', '0046_merge_20240416_2233'), + ] + + operations = [ + migrations.AddField( + model_name='speechconversation', + name='freeze_task', + field=models.BooleanField(default=False, help_text='Field to Indicate whether the current task is frozen by the administrator to prevent being annotated.', verbose_name='freeze_task'), + ), + ] diff --git a/backend/dataset/models.py b/backend/dataset/models.py index ec16c9903..12631b7fc 100644 --- a/backend/dataset/models.py +++ b/backend/dataset/models.py @@ -1,6 +1,7 @@ """ Model definitions for Dataset Management """ + from django.db import models from users.models import User, LANG_CHOICES from organizations.models import Organization @@ -485,6 +486,12 @@ class SpeechConversation(DatasetBase): help_text=("Prepopulated prediction for the implemented models"), ) + freeze_task = models.BooleanField( + verbose_name="freeze_task", + default=False, + help_text="Field to Indicate whether the current task is frozen by the administrator to prevent being annotated.", + ) + def __str__(self): return str(self.id) diff --git a/backend/dataset/serializers.py b/backend/dataset/serializers.py index a6152f3c2..3cd04221d 100644 --- 
a/backend/dataset/serializers.py +++ b/backend/dataset/serializers.py @@ -11,6 +11,21 @@ class DatasetInstanceSerializer(serializers.ModelSerializer): class Meta: model = DatasetInstance fields = "__all__" + + +class DatasetInstanceSerializerOptimized(serializers.ModelSerializer): + class Meta: + model = DatasetInstance + fields = [ + "instance_id", + "parent_instance_id", + "instance_name", + "instance_description", + "dataset_type", + "public_to_managers", + "organisation_id" + ] + class DatasetInstanceUploadSerializer(serializers.Serializer): diff --git a/backend/dataset/views.py b/backend/dataset/views.py index 7e6b4227c..fc0407ec8 100644 --- a/backend/dataset/views.py +++ b/backend/dataset/views.py @@ -38,6 +38,11 @@ from . import resources from .models import * from .serializers import * +from django.db.models import Prefetch, Q, F +from utils.dataset_utils import get_batch_dataset_upload_status +from rest_framework.response import Response +from rest_framework.decorators import action +from rest_framework import status from .tasks import upload_data_to_data_instance, deduplicate_dataset_instance_items import dataset from tasks.models import ( @@ -186,6 +191,22 @@ def get_dataset_upload_status(dataset_instance_pk): # Create your views here. +# def get_batch_dataset_upload_status(instance_ids): +# """ +# Batch fetch upload status for a list of dataset instance IDs. +# Replace this with actual logic to retrieve status from your database. 
+# """ +# # Mock data for testing +# status_data = {} +# for instance_id in instance_ids: +# status_data[instance_id] = { +# "last_upload_status": "Completed", +# "last_upload_date": "2023-01-01", +# "last_upload_time": "12:00:00", +# "last_upload_result": "Success", +# } +# return status_data + class DatasetInstanceViewSet(viewsets.ModelViewSet): """ ViewSet for Dataset Instance @@ -244,6 +265,8 @@ def retrieve(self, request, pk, *args, **kwargs): ), ], ) + + def list(self, request, *args, **kwargs): # Org Owners and superusers see all datasets if request.user.is_superuser: @@ -257,7 +280,6 @@ def list(self, request, *args, **kwargs): queryset = DatasetInstance.objects.filter( organisation_id=request.user.organization ).filter(Q(public_to_managers=True) | Q(users__id=request.user.id)) - if "dataset_visibility" in request.query_params: dataset_visibility = request.query_params["dataset_visibility"] if dataset_visibility == "all_public_datasets": @@ -267,18 +289,15 @@ def list(self, request, *args, **kwargs): queryset = queryset.filter(public_to_managers=True) elif dataset_visibility == "my_datasets": queryset = queryset.filter(users__id=request.user.id) - # Filter the queryset based on the query params if "dataset_type" in dict(request.query_params): queryset = queryset.filter( dataset_type__exact=request.query_params["dataset_type"] ) - # Serialize the distinct items and sort by instance ID serializer = DatasetInstanceSerializer( queryset.distinct().order_by("instance_id"), many=True ) - # Add status fields to the serializer data for dataset_instance in serializer.data: # Get the task statuses for the dataset instance @@ -288,14 +307,65 @@ def list(self, request, *args, **kwargs): dataset_instance_time, dataset_instance_result, ) = get_dataset_upload_status(dataset_instance["instance_id"]) - # Add the task status and time to the dataset instance response dataset_instance["last_upload_status"] = dataset_instance_status dataset_instance["last_upload_date"] = 
dataset_instance_date dataset_instance["last_upload_time"] = dataset_instance_time dataset_instance["last_upload_result"] = dataset_instance_result - return Response(serializer.data) + + + # def get_queryset(self): + @action(detail=False, methods=["get"], url_path="optimized-list") + def list_optimized(self, request): + # Base queryset determination based on user role + queryset = DatasetInstance.objects.all() + if request.user.is_superuser: + queryset = queryset + elif request.user.role == User.ORGANIZATION_OWNER: + queryset = queryset.filter( + organisation_id=request.user.organization + ) + else: + queryset = queryset.filter( + organisation_id=request.user.organization + ).filter(Q(public_to_managers=True) | Q(users__id=request.user.id)) + # Apply filters using request query parameters + dataset_visibility = request.query_params.get("dataset_visibility") + if dataset_visibility == "all_public_datasets": + queryset = queryset.filter(public_to_managers=True) + elif dataset_visibility == "my_datasets": + queryset = queryset.filter(users__id=request.user.id) + dataset_type = request.query_params.get("dataset_type") + if dataset_type: + queryset = queryset.filter(dataset_type__exact=dataset_type) + archived_datasets = request.query_params.get("archived_datasets") + if archived_datasets == "true": + queryset = queryset.filter(is_archived=True) + elif archived_datasets == "false": + queryset = queryset.filter(is_archived=False) + # Sort by criteria + sort_type = request.query_params.get("sort_type") + if sort_type == "recently_updated": + queryset = queryset.order_by(F("last_updated").desc(nulls_last=True)) + else: + queryset = queryset.order_by("instance_id") + # Optimize related field loading + queryset = queryset.prefetch_related( + Prefetch("users"), # Prefetch the related users + ) + # Serialize the data + serializer = DatasetInstanceSerializerOptimized(queryset.distinct(), many=True) + # Batch process upload status for all datasets + instance_ids = 
[instance["instance_id"] for instance in serializer.data] + status_data = get_batch_dataset_upload_status(instance_ids) + # Annotate upload status in the response + for dataset_instance in serializer.data: + instance_id = dataset_instance["instance_id"] + if instance_id in status_data: + dataset_instance.update(status_data[instance_id]) + return Response(serializer.data, status=status.HTTP_200_OK) + @is_organization_owner @action(methods=["GET"], detail=True, name="Download Dataset in CSV format") diff --git a/backend/functions/tasks.py b/backend/functions/tasks.py index f6bf51261..5e74ac8d8 100644 --- a/backend/functions/tasks.py +++ b/backend/functions/tasks.py @@ -1,7 +1,10 @@ import datetime +import json import time import zipfile import threading + +import requests from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions import pandas as pd from celery import shared_task @@ -29,7 +32,7 @@ ANNOTATED, ) from tasks.views import SentenceOperationViewSet -from users.models import User, LANG_CHOICES +from users.models import User from django.core.mail import EmailMessage from utils.blob_functions import ( @@ -47,7 +50,7 @@ get_batch_asr_predictions, ) from django.db import transaction, DataError, IntegrityError -from dataset.models import DatasetInstance +from dataset.models import DatasetInstance, SpeechConversation from django.apps import apps from rest_framework.test import APIRequestFactory from django.http import QueryDict @@ -56,8 +59,13 @@ import tempfile from shoonya_backend.locks import Lock - +from utils.constants import LANG_CHOICES +from projects.tasks import filter_data_items +from projects.models import BATCH +from dataset import models as dataset_models +from projects.registry_helper import ProjectRegistry import logging +from tqdm import tqdm logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -72,6 +80,10 @@ def sentence_text_translate_and_save_translation_pairs( input_dataset_instance_id, 
output_dataset_instance_id, batch_size, + filter_string, + sampling_mode, + sampling_parameters, + variable_parameters, api_type="indic-trans-v2", checks_for_particular_languages=False, automate_missing_data_items=True, @@ -87,6 +99,10 @@ def sentence_text_translate_and_save_translation_pairs( Allowed - [indic-trans, google, indic-trans-v2, azure, blank] checks_for_particular_languages (bool): If True, checks for the particular languages in the translations. automate_missing_data_items (bool): If True, consider only those data items that are missing in the target dataset instance. + filter_string (str): string to filter input data. + sampling_mode (str): can be batch or full. + sampling_parameters (json): is a json that contains, batch number and batch size + """ task_name = "sentence_text_translate_and_save_translation_pairs" output_sentences = list( @@ -113,6 +129,14 @@ def sentence_text_translate_and_save_translation_pairs( "metadata_json", ) ) + if filter_string and sampling_mode and sampling_parameters: + input_sentences = get_filtered_items( + "SentenceText", + input_dataset_instance_id, + filter_string, + sampling_mode, + sampling_parameters, + ) # Convert the input_sentences list into a dataframe input_sentences_complete_df = pd.DataFrame( @@ -403,7 +427,15 @@ def conversation_data_machine_translation( @shared_task(bind=True) def generate_ocr_prediction_json( - self, dataset_instance_id, user_id, api_type, automate_missing_data_items + self, + dataset_instance_id, + user_id, + api_type, + automate_missing_data_items, + filter_string, + sampling_mode, + sampling_parameters, + variable_parameters, ): """Function to generate OCR prediction data and to save to the same data item. 
Args: @@ -436,7 +468,14 @@ def generate_ocr_prediction_json( ) except Exception as e: ocr_data_items = [] - + if filter_string and sampling_mode and sampling_parameters: + ocr_data_items = get_filtered_items( + "OCRDocument", + dataset_instance_id, + filter_string, + sampling_mode, + sampling_parameters, + ) # converting the dataset_instance to pandas dataframe. ocr_data_items_df = pd.DataFrame( ocr_data_items, @@ -555,7 +594,15 @@ def generate_ocr_prediction_json( @shared_task(bind=True) def generate_asr_prediction_json( - self, dataset_instance_id, user_id, api_type, automate_missing_data_items + self, + dataset_instance_id, + user_id, + api_type, + automate_missing_data_items, + filter_string, + sampling_mode, + sampling_parameters, + variable_parameters, ): """Function to generate ASR prediction data and to save to the same data item. Args: @@ -589,7 +636,14 @@ def generate_asr_prediction_json( ) except Exception as e: asr_data_items = [] - + if filter_string and sampling_mode and sampling_parameters: + asr_data_items = get_filtered_items( + "SpeechConversation", + dataset_instance_id, + filter_string, + sampling_mode, + sampling_parameters, + ) # converting the dataset_instance to pandas dataframe. 
asr_data_items_df = pd.DataFrame( asr_data_items, @@ -703,7 +757,16 @@ def generate_asr_prediction_json( @shared_task(bind=True) -def populate_draft_data_json(self, pk, user_id, fields_list): +def populate_draft_data_json( + self, + pk, + user_id, + fields_list, + filter_string, + sampling_mode, + sampling_parameters, + variable_parameters, +): task_name = "populate_draft_data_json" try: dataset_instance = DatasetInstance.objects.get(pk=pk) @@ -712,6 +775,10 @@ def populate_draft_data_json(self, pk, user_id, fields_list): dataset_type = dataset_instance.dataset_type dataset_model = apps.get_model("dataset", dataset_type) dataset_items = dataset_model.objects.filter(instance_id=dataset_instance) + if filter_string and sampling_mode and sampling_parameters: + dataset_items = get_filtered_items( + dataset_type, pk, filter_string, sampling_mode, sampling_parameters + ) cnt = 0 for dataset_item in dataset_items: new_draft_data_json = {} @@ -1695,3 +1762,98 @@ def upload_all_projects_to_blob_and_get_url(csv_files_directory): return "Error in generating url" blob_url = f"https://{account_name}.blob.{endpoint_suffix}/{CONTAINER_NAME_FOR_DOWNLOAD_ALL_PROJECTS}/{blob_client.blob_name}?{sas_token}" return blob_url + + +def get_filtered_items( + dataset_model, + dataset_instance_id, + filter_string, + sampling_mode, + sampling_parameters, +): + registry_helper = ProjectRegistry.get_instance() + project_type = registry_helper.get_project_name_from_dataset(dataset_model) + if not isinstance(dataset_instance_id, list): + dataset_instance_id = [dataset_instance_id] + filtered_items = filter_data_items( + project_type=project_type, + dataset_instance_ids=dataset_instance_id, + filter_string=filter_string, + ) + # Apply sampling + if sampling_mode == BATCH: + batch_size = sampling_parameters["batch_size"] + try: + batch_number = sampling_parameters["batch_number"] + if len(batch_number) == 0: + batch_number = [1] + except KeyError: + batch_number = [1] + sampled_items = [] + for 
batch_num in batch_number: + sampled_items += filtered_items[ + batch_size * (batch_num - 1) : batch_size * batch_num + ] + else: + sampled_items = filtered_items + return sampled_items + + +@shared_task( + bind=True, +) +def update_SpeechConversation(self, lang, pid, auto_annotation, user_id): + UPDATE_SPEECH_CONVERSATION_API_URL = os.getenv("UPDATE_SPEECH_CONVERSATION_API_URL") + user_name = User.objects.filter(id=user_id)[0].username + data_item_list = [ + t.input_data_id + for t in Task.objects.filter(project_id=pid, task_status="incomplete") + ] + tasks_objects = Task.objects.filter(project_id=pid, task_status="incomplete") + related_tasks_ids = [task.id for task in tasks_objects] + related_annos = Annotation.objects.filter(task__id__in=related_tasks_ids) + for anno in related_annos: + anno.delete() + for task in tasks_objects: + task.delete() + data_items = SpeechConversation.objects.filter(id__in=data_item_list) + data_items_list = [] + for data_item in tqdm(data_items): + try: + MEDIA_URL = data_item.audio_url + pred_json = ( + json.loads(data_item.prediction_json) + if isinstance(data_item.prediction_json, str) + else data_item.prediction_json + ) + data = [{"audioUrl": MEDIA_URL, "audioJson": pred_json, "audioLang": lang}] + pred_text_json = requests.post( + UPDATE_SPEECH_CONVERSATION_API_URL, json=json.dumps(data) + ) + json_pred_final = json.loads(pred_text_json.text)[0] + except: + pass + setattr(data_item, "prediction_json", json_pred_final) + data_items_list.append(data_item) + SpeechConversation.objects.bulk_update(data_items_list, ["prediction_json"], 512) + + data_items_list = [] + for data_item in tqdm(data_items): + new_draft_data_json = {} + pred_json = ( + json.loads(data_item.prediction_json) + if isinstance(data_item.prediction_json, str) + else data_item.prediction_json + ) + try: + new_draft_data_json["transcribed_json"] = getattr( + data_item, "prediction_json" + ) + if new_draft_data_json["transcribed_json"] == "None": + del 
new_draft_data_json["transcribed_json"] + except: + pass + setattr(data_item, "draft_data_json", new_draft_data_json) + data_items_list.append(data_item) + SpeechConversation.objects.bulk_update(data_items_list, ["draft_data_json"], 512) + print(f"SpeechConversation Dataset updated for {pid} by {user_name}") diff --git a/backend/functions/urls.py b/backend/functions/urls.py index 4b3632568..ac11d9e50 100644 --- a/backend/functions/urls.py +++ b/backend/functions/urls.py @@ -29,6 +29,7 @@ ), path("schedule_project_reports_email", schedule_project_reports_email), path("download_all_projects", download_all_projects), + path("schedule_update_SpeechConversation", schedule_update_SpeechConversation), ] # urlpatterns = format_suffix_patterns(urlpatterns) diff --git a/backend/functions/views.py b/backend/functions/views.py index 09608665b..ee83c4f2f 100644 --- a/backend/functions/views.py +++ b/backend/functions/views.py @@ -14,6 +14,7 @@ ) from tasks.models import * +from utils.constants import lang_codes from .tasks import ( conversation_data_machine_translation, @@ -23,6 +24,7 @@ generate_asr_prediction_json, schedule_mail_for_project_reports, schedule_mail_to_download_all_projects, + update_SpeechConversation, ) from .utils import ( check_conversation_translation_function_inputs, @@ -274,6 +276,10 @@ def schedule_sentence_text_translate_job(request): automate_missing_data_items = request.data.get( "automate_missing_data_items", "true" ) + filter_string = request.data.get("filter_string", None) + sampling_mode = request.data.get("sampling_mode", None) + sampling_parameters = request.data.get("sampling_parameters_json", None) + variable_parameters = request.data.get("variable_parameters", None) # Convert checks for languages into boolean checks_for_particular_languages = checks_for_particular_languages.lower() == "true" @@ -311,6 +317,10 @@ def schedule_sentence_text_translate_job(request): input_dataset_instance_id=input_dataset_instance_id, 
output_dataset_instance_id=output_dataset_instance_id, batch_size=batch_size, + filter_string=filter_string, + sampling_mode=sampling_mode, + sampling_parameters=sampling_parameters, + variable_parameters=variable_parameters, api_type=api_type, checks_for_particular_languages=checks_for_particular_languages, automate_missing_data_items=automate_missing_data_items, @@ -537,7 +547,10 @@ def schedule_ocr_prediction_json_population(request): except KeyError: automate_missing_data_items = True - # Calling a function asynchronously to create ocr predictions. + filter_string = request.data.get("filter_string") + sampling_mode = request.data.get("sampling_mode") + sampling_parameters = request.data.get("sampling_parameters_json") + variable_parameters = request.data.get("variable_parameters") uid = request.user.id @@ -546,6 +559,10 @@ def schedule_ocr_prediction_json_population(request): user_id=uid, api_type=api_type, automate_missing_data_items=automate_missing_data_items, + filter_string=filter_string, + sampling_mode=sampling_mode, + sampling_parameters=sampling_parameters, + variable_parameters=variable_parameters, ) # Returning response @@ -574,8 +591,20 @@ def schedule_draft_data_json_population(request): pk = request.data["dataset_instance_id"] uid = request.user.id + filter_string = request.data.get("filter_string") + sampling_mode = request.data.get("sampling_mode") + sampling_parameters = request.data.get("sampling_parameters_json") + variable_parameters = request.data.get("variable_parameters") - populate_draft_data_json.delay(pk=pk, user_id=uid, fields_list=fields_list) + populate_draft_data_json( + pk=pk, + user_id=uid, + fields_list=fields_list, + filter_string=filter_string, + sampling_mode=sampling_mode, + sampling_parameters=sampling_parameters, + variable_parameters=variable_parameters, + ) ret_dict = {"message": "draft_data_json population started"} ret_status = status.HTTP_200_OK @@ -624,7 +653,10 @@ def 
schedule_asr_prediction_json_population(request): except KeyError: automate_missing_data_items = True - # Calling a function asynchronously to create ocr predictions. + filter_string = request.data.get("filter_string") + sampling_mode = request.data.get("sampling_mode") + sampling_parameters = request.data.get("sampling_parameters_json") + variable_parameters = request.data.get("variable_parameters") uid = request.user.id @@ -633,6 +665,10 @@ def schedule_asr_prediction_json_population(request): user_id=uid, api_type=api_type, automate_missing_data_items=automate_missing_data_items, + filter_string=filter_string, + sampling_mode=sampling_mode, + sampling_parameters=sampling_parameters, + variable_parameters=variable_parameters, ) ret_dict = {"message": "Generating ASR Predictions"} @@ -873,3 +909,33 @@ def download_all_projects(request): }, status=status.HTTP_200_OK, ) + + +@api_view(["POST"]) +def schedule_update_SpeechConversation(request): + try: + project_id = request.data["project_id"] + except Exception as e: + return Response( + {"message": "Please send a project_id"}, + status=status.HTTP_400_BAD_REQUEST, + ) + user_id = request.user.id + project = Project.objects.filter(id=project_id) + try: + tg_lang = project[0].tgt_language.lower() + model_language = lang_codes[tg_lang] + except Exception as e: + return Response( + {"message": "Error in Fetching language"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + auto_annotation = request.data.get("auto_annotation", None) + update_SpeechConversation.delay( + model_language, project_id, auto_annotation, user_id + ) + + return Response( + {"message": "Update In Progress"}, + status=status.HTTP_200_OK, + ) diff --git a/backend/organizations/views.py b/backend/organizations/views.py index 7d5e7b726..09781b624 100644 --- a/backend/organizations/views.py +++ b/backend/organizations/views.py @@ -6,6 +6,7 @@ from rest_framework import status from tasks.models import ( Task, + Statistic, ANNOTATOR_ANNOTATION, 
REVIEWER_ANNOTATION, SUPER_CHECKER_ANNOTATION, @@ -52,6 +53,7 @@ send_project_analytics_mail_org, send_user_analytics_mail_org, ) +from utils.filter_tasks_by_ann_type import filter_tasks_by_ann_type def get_task_count(proj_ids, status, annotator, return_count=True): @@ -460,11 +462,11 @@ def quality_reports(self, request, pk=None): participation_type = ( "Full Time" if participation_type == 1 - else "Part Time" - if participation_type == 2 - else "Contract Basis" - if participation_type == 4 - else "N/A" + else ( + "Part Time" + if participation_type == 2 + else "Contract Basis" if participation_type == 4 else "N/A" + ) ) role = get_role_name(annotator.role) user_id = annotator.id @@ -778,11 +780,11 @@ def user_analytics(self, request, pk=None): participation_type = ( "Full Time" if participation_type == 1 - else "Part Time" - if participation_type == 2 - else "Contract Basis" - if participation_type == 4 - else "N/A" + else ( + "Part Time" + if participation_type == 2 + else "Contract Basis" if participation_type == 4 else "N/A" + ) ) role = get_role_name(annotator.role) user_id = annotator.id @@ -934,9 +936,9 @@ def iter_items(items, pseudo_buffer): status=status.HTTP_200_OK, content_type="text/csv", ) - response[ - "Content-Disposition" - ] = f'attachment; filename="{organization.title}_user_analytics.csv"' + response["Content-Disposition"] = ( + f'attachment; filename="{organization.title}_user_analytics.csv"' + ) return response return Response(data=final_result, status=status.HTTP_200_OK) @@ -2713,6 +2715,7 @@ def cumulative_tasks_count(self, request, pk=None): "AudioSegmentation", "AudioTranscription", "AudioTranscriptionEditing", + "StandardisedTranscriptionEditing", "ContextualSentenceVerification", "ContextualSentenceVerificationAndDomainClassification", "ContextualTranslationEditing", @@ -2742,24 +2745,41 @@ def cumulative_tasks_count(self, request, pk=None): other_lang = [] for lang in languages: proj_lang_filter = proj_objs.filter(tgt_language=lang) - 
annotation_tasks_count = 0 - reviewer_task_count = 0 + annotation_tasks = Task.objects.filter( + project_id__in=proj_lang_filter, + task_status__in=[ + "annotated", + "reviewed", + "super_checked", + ], + ) reviewer_tasks = Task.objects.filter( project_id__in=proj_lang_filter, project_id__project_stage__in=[REVIEW_STAGE, SUPERCHECK_STAGE], - task_status__in=["reviewed", "exported", "super_checked"], + task_status__in=["reviewed", "super_checked"], ) - - annotation_tasks = Task.objects.filter( + supercheck_tasks = Task.objects.filter( + project_id__in=proj_lang_filter, + project_id__project_stage__in=[SUPERCHECK_STAGE], + task_status__in=["super_checked"], + ) + annotation_tasks_exported = Task.objects.filter( project_id__in=proj_lang_filter, + project_id__project_stage__in=[ANNOTATION_STAGE], task_status__in=[ - "annotated", - "reviewed", "exported", - "super_checked", ], ) - + reviewer_tasks_exported = Task.objects.filter( + project_id__in=proj_lang_filter, + project_id__project_stage__in=[REVIEW_STAGE], + task_status__in=["exported"], + ) + supercheck_tasks_exported = Task.objects.filter( + project_id__in=proj_lang_filter, + project_id__project_stage__in=[SUPERCHECK_STAGE], + task_status__in=["exported"], + ) if metainfo == True: result = {} @@ -2974,15 +2994,30 @@ def cumulative_tasks_count(self, request, pk=None): } else: - reviewer_task_count = reviewer_tasks.count() - - annotation_tasks_count = annotation_tasks.count() - - result = { - "language": lang, - "ann_cumulative_tasks_count": annotation_tasks_count, - "rew_cumulative_tasks_count": reviewer_task_count, - } + # reviewer_task_count = ( + # reviewer_tasks.count() + # + reviewer_tasks_exported.count() + # + supercheck_tasks_exported.count() + # ) + + # annotation_tasks_count = ( + # annotation_tasks.count() + # + annotation_tasks_exported.count() + # + reviewer_tasks_exported.count() + # + supercheck_tasks_exported.count() + # ) + + # supercheck_tasks_count = ( + # supercheck_tasks.count() + 
supercheck_tasks_exported.count() + # ) + + # result = { + # "language": lang, + # "ann_cumulative_tasks_count": annotation_tasks_count, + # "rew_cumulative_tasks_count": reviewer_task_count, + # "sup_cumulative_tasks_count": supercheck_tasks_count, + # } + result = {} if lang == None or lang == "": other_lang.append(result) @@ -2991,6 +3026,7 @@ def cumulative_tasks_count(self, request, pk=None): ann_task_count = 0 rew_task_count = 0 + sup_task_count = 0 ann_word_count = 0 rew_word_count = 0 ann_aud_dur = 0 @@ -3003,8 +3039,10 @@ def cumulative_tasks_count(self, request, pk=None): rev_sentance_count = 0 for dat in other_lang: if metainfo != True: - ann_task_count += dat["ann_cumulative_tasks_count"] - rew_task_count += dat["rew_cumulative_tasks_count"] + # ann_task_count += dat["ann_cumulative_tasks_count"] + # rew_task_count += dat["rew_cumulative_tasks_count"] + # sup_task_count += dat["sup_cumulative_tasks_count"] + pass else: if project_type in get_audio_project_types(): ann_aud_dur += convert_hours_to_seconds( @@ -3043,11 +3081,13 @@ def cumulative_tasks_count(self, request, pk=None): if len(other_lang) > 0: if metainfo != True: - other_language = { - "language": "Others", - "ann_cumulative_tasks_count": ann_task_count, - "rew_cumulative_tasks_count": rew_task_count, - } + # other_language = { + # "language": "Others", + # "ann_cumulative_tasks_count": ann_task_count, + # "rew_cumulative_tasks_count": rew_task_count, + # "sup_cumulative_tasks_count": sup_task_count, + # } + other_language = {} else: if project_type in get_audio_project_types(): other_language = { @@ -3119,4 +3159,12 @@ def cumulative_tasks_count(self, request, pk=None): pass else: final_result_for_all_types[project_type] = final_result + if metainfo != True: + + task_counts = list( + Statistic.objects.filter(stat_type="task_count", org_id=organization.id) + )[0].result + + for pjt_type in project_types: + final_result_for_all_types[pjt_type] = task_counts[pjt_type] return 
Response(final_result_for_all_types) diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py index 2dddab5d3..3629c8b28 100644 --- a/backend/projects/annotation_registry.py +++ b/backend/projects/annotation_registry.py @@ -108,6 +108,15 @@ "type": "labels", }, }, + "OCRTextlineSegmentation": { + "ocr_transcribed_json": { + "to_name": "image_url", + "from_name": [ + "annotation_bboxes", + ], + "type": ["rectangle"], + }, + }, "OCRTranscription": { "ocr_transcribed_json": { "to_name": "image_url", @@ -167,59 +176,6 @@ } -def convert_prediction_json_to_annotation_result( - prediction_json, speakers_json, audio_duration, index, is_acoustic=False -): - """ - Convert prediction_json and transcribed_json to annotation_result - """ - - result = [] - if prediction_json == None: - return result - - for idx, val in enumerate(prediction_json): - label_dict = { - "origin": "manual", - "to_name": "audio_url", - "from_name": "labels", - "original_length": audio_duration, - } - text_dict = { - "origin": "manual", - "to_name": "audio_url", - "from_name": "transcribed_json", - "original_length": audio_duration, - } - if is_acoustic: - text_dict["from_name"] = "verbatim_transcribed_json" - id = f"shoonya_{index}s{idx}s{generate_random_string(13-len(str(idx)))}" - label_dict["id"] = id - text_dict["id"] = id - label_dict["type"] = "labels" - text_dict["type"] = "textarea" - - value_labels = { - "start": val["start"], - "end": val["end"], - "labels": [ - next( - speaker - for speaker in speakers_json - if speaker["speaker_id"] == val["speaker_id"] - )["name"] - ], - } - value_text = {"start": val["start"], "end": val["end"], "text": [val["text"]]} - - label_dict["value"] = value_labels - text_dict["value"] = value_text - result.append(label_dict) - result.append(text_dict) - - return result - - def convert_conversation_json_to_annotation_result(conversation_json, idx): result = [] for i in range(len(conversation_json)): @@ -239,12 +195,15 @@ def 
convert_conversation_json_to_annotation_result(conversation_json, idx): def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None): + from projects.views import convert_prediction_json_to_annotation_result + registry_helper = ProjectRegistry.get_instance() input_dataset_info = registry_helper.get_input_dataset_and_fields(project_type) dataset_model = getattr(dataset_models, input_dataset_info["dataset_type"]) try: dataset_item = dataset_model.objects.get(pk=pk) except: + dataset_item = None pass result = [] idx = 0 @@ -263,13 +222,20 @@ def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None) if field == "conversation_json": ans = convert_conversation_json_to_annotation_result(value, idx) elif field == "transcribed_json" or field == "prediction_json": - ans = convert_prediction_json_to_annotation_result( + assert type(value) in [ + list, + dict, + ], f"Something wrong is there in the type of {value}" + if isinstance(value, list): + value = {"verbatim_transcribed_json": value} + sub_ans = convert_prediction_json_to_annotation_result( + None, + project_type, + dataset_item, value, - dataset_item.speakers_json, - dataset_item.audio_duration, - idx, - project_type == "AcousticNormalisedTranscriptionEditing", + True, ) + ans.extend(sub_ans) else: if field_type == "textarea": field_dict["value"] = {"text": [value]} diff --git a/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx b/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx new file mode 100644 index 000000000..7684b037b --- /dev/null +++ b/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx @@ -0,0 +1,5 @@ + + + + + diff --git a/backend/projects/project_registry.yaml b/backend/projects/project_registry.yaml index b4c0c8d85..3c1e4f5eb 100644 --- a/backend/projects/project_registry.yaml +++ b/backend/projects/project_registry.yaml @@ -99,6 +99,24 @@ OCR: fields: annotations: - ocr_transcribed_json + 
OCRTextlineSegmentation: + project_mode: "Annotation" + label_studio_jsx_file: "ocr/ocr_textline_segmentation.jsx" + input_dataset: + class: OCRDocument + fields: + - image_url + - page_number + display_fields: + - image_url + - page_number + prediction: ocr_prediction_json + output_dataset: + class: OCRDocument + save_type: in_place + fields: + annotations: + - ocr_transcribed_json OCRTranscriptionEditing: project_mode: "Annotation" label_studio_jsx_file: "ocr/ocr_transcription.jsx" diff --git a/backend/projects/registry_helper.py b/backend/projects/registry_helper.py index ed1859e4c..3f8a5653a 100644 --- a/backend/projects/registry_helper.py +++ b/backend/projects/registry_helper.py @@ -253,3 +253,15 @@ def validate_registry(self): ) return True + + def get_project_name_from_dataset(self, dataset_name: str): + for project_key, project_type in self.project_types.items(): + input_dataset = project_type.get("input_dataset", {}) + output_dataset = project_type.get("output_dataset", {}) + + if ( + input_dataset.get("class") == dataset_name + or output_dataset.get("class") == dataset_name + ): + return project_key + return None diff --git a/backend/projects/utils.py b/backend/projects/utils.py index 4987ed878..c2b877007 100644 --- a/backend/projects/utils.py +++ b/backend/projects/utils.py @@ -27,7 +27,7 @@ from jiwer import wer from utils.convert_result_to_chitralekha_format import create_memory - +from dataset import models as dataset_models nltk.download("punkt") @@ -361,7 +361,10 @@ def process_speech_tasks(task, is_audio_segmentation, project_type): def process_ocr_tasks( - task, is_OCRSegmentCategorization, is_OCRSegmentCategorizationEditing + task, + is_OCRSegmentCategorization, + is_OCRSegmentCategorizationEditing, + is_OCRTextlineSegmentation, ): annotation_result = process_annotation_result(task) process_ocr_results( @@ -369,6 +372,7 @@ def process_ocr_tasks( annotation_result, is_OCRSegmentCategorization, is_OCRSegmentCategorizationEditing, + 
def get_project_name_from_dataset(self, dataset_name: str):
    """Return the first registered project type whose input or output
    dataset class matches *dataset_name*.

    Registry order decides ties; returns None when no project type
    references the dataset class.
    """
    for type_key, type_cfg in self.project_types.items():
        candidate_classes = (
            type_cfg.get("input_dataset", {}).get("class"),
            type_cfg.get("output_dataset", {}).get("class"),
        )
        if dataset_name in candidate_classes:
            return type_key
    return None
dataset_models +from django.db.models import Exists, OuterRef from dataset.models import ( DatasetInstance, @@ -402,6 +403,7 @@ def get_review_reports(proj_id, userid, start_date, end_date): "OCRTranscription", "OCRSegmentCategorization", "OCRSegmentCategorizationEditing", + "OCRTextlineSegmentation", ]: result["Total Word Count"] = total_word_count elif proj_type in get_audio_project_types(): @@ -650,15 +652,16 @@ def get_supercheck_reports(proj_id, userid, start_date, end_date): "OCRTranscription", "OCRSegmentCategorization", "OCRSegmentCategorizationEditing", + "OCRTextlineSegmentation", ]: result["Validated Word Count"] = validated_word_count result["Validated With Changes Word Count"] = validated_with_changes_word_count result["Rejected Word Count"] = rejected_word_count elif proj_type in get_audio_project_types(): result["Validated Segments Duration"] = validated_audio_duration - result[ - "Validated With Changes Segments Duration" - ] = validated_with_changes_audio_duration + result["Validated With Changes Segments Duration"] = ( + validated_with_changes_audio_duration + ) result["Rejected Segments Duration"] = rejected_audio_duration result["Total Raw Audio Duration"] = total_raw_audio_duration result["Average Word Error Rate R/S"] = round(avg_word_error_rate, 2) @@ -854,69 +857,152 @@ def get_task_count_unassigned(pk, user): return len(proj_tasks_unassigned) -def convert_prediction_json_to_annotation_result(pk, proj_type): +def convert_prediction_json_to_annotation_result( + pk, proj_type, data_item, prediction_json, populate_draft_data=False +): result = [] if ( proj_type == "AudioTranscriptionEditing" or proj_type == "AcousticNormalisedTranscriptionEditing" ): - data_item = SpeechConversation.objects.get(pk=pk) - prediction_json = ( - json.loads(data_item.prediction_json) - if isinstance(data_item.prediction_json, str) - else data_item.prediction_json - ) + if not data_item and not prediction_json: + data_item = SpeechConversation.objects.get(pk=pk) + 
prediction_json = ( + json.loads(data_item.prediction_json) + if isinstance(data_item.prediction_json, str) + else data_item.prediction_json + ) + assert type(prediction_json) in [ + dict, + list, + ], "Seems something is wrong with the formatting" + # see if the prediction is a list, then it seems that only verbatim json is present + if isinstance(prediction_json, list): + prediction_json = {"verbatim_transcribed_json": prediction_json} + speakers_json = data_item.speakers_json audio_duration = data_item.audio_duration # converting prediction_json to result (wherever it exists) for every task. if prediction_json == None: return result - for idx, val in enumerate(prediction_json): - label_dict = { - "origin": "manual", - "to_name": "audio_url", - "from_name": "labels", - "original_length": audio_duration, - } - text_dict = { - "origin": "manual", - "to_name": "audio_url", - "from_name": "transcribed_json", - "original_length": audio_duration, - } - if proj_type == "AcousticNormalisedTranscriptionEditing": - text_dict["from_name"] = "verbatim_transcribed_json" - id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}" - label_dict["id"] = id - text_dict["id"] = id - label_dict["type"] = "labels" - text_dict["type"] = "textarea" - - value_labels = { - "start": val["start"], - "end": val["end"], - "labels": [ - next( - speaker - for speaker in speakers_json - if speaker["speaker_id"] == val["speaker_id"] - )["name"] - ], - } - value_text = { - "start": val["start"], - "end": val["end"], - "text": [val["text"]], - } + # for pred_type, pred_json in prediction_json.items(): + if "acoustic_normalised_transcribed_json" in prediction_json.keys(): + for idx, (val, val_acoustic) in enumerate( + zip( + prediction_json["verbatim_transcribed_json"], + prediction_json["acoustic_normalised_transcribed_json"], + ) + ): + label_dict = { + "origin": "manual", + "to_name": "audio_url", + "from_name": "labels", + "original_length": audio_duration, + } + text_dict = { + 
"origin": "manual", + "to_name": "audio_url", + "from_name": "transcribed_json", + "original_length": audio_duration, + } + text_dict_acoustic = { + "origin": "manual", + "to_name": "audio_url", + "from_name": "transcribed_json", + "original_length": audio_duration, + } + if proj_type == "AcousticNormalisedTranscriptionEditing": + text_dict["from_name"] = "verbatim_transcribed_json" + text_dict_acoustic["from_name"] = ( + "acoustic_normalised_transcribed_json" + ) - label_dict["value"] = value_labels - text_dict["value"] = value_text - # mainly label_dict and text_dict are sent as result - result.append(label_dict) - result.append(text_dict) + id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}" + label_dict["id"] = id + text_dict["id"] = id + text_dict_acoustic["id"] = id + + label_dict["type"] = "labels" + text_dict["type"] = "textarea" + text_dict_acoustic["type"] = "textarea" + + value_labels = { + "start": val["start"], + "end": val["end"], + "labels": [ + next( + speaker + for speaker in speakers_json + if speaker["speaker_id"] == val["speaker_id"] + )["name"] + ], + } + value_text = { + "start": val["start"], + "end": val["end"], + "text": [val["text"]], + } + value_text_acoustic = { + "start": val_acoustic["start"], + "end": val_acoustic["end"], + "text": [val_acoustic["text"]], + } + + label_dict["value"] = value_labels + text_dict["value"] = value_text + text_dict_acoustic["value"] = value_text_acoustic + # mainly label_dict and text_dict are sent as result + result.append(label_dict) + result.append(text_dict) + result.append(text_dict_acoustic) + else: + for idx, val in enumerate(prediction_json["verbatim_transcribed_json"]): + label_dict = { + "origin": "manual", + "to_name": "audio_url", + "from_name": "labels", + "original_length": audio_duration, + } + text_dict = { + "origin": "manual", + "to_name": "audio_url", + "from_name": "transcribed_json", + "original_length": audio_duration, + } + if proj_type == 
"AcousticNormalisedTranscriptionEditing": + text_dict["from_name"] = "verbatim_transcribed_json" + id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}" + label_dict["id"] = id + text_dict["id"] = id + label_dict["type"] = "labels" + text_dict["type"] = "textarea" + + value_labels = { + "start": val["start"], + "end": val["end"], + "labels": [ + next( + speaker + for speaker in speakers_json + if speaker["speaker_id"] == val["speaker_id"] + )["name"] + ], + } + value_text = { + "start": val["start"], + "end": val["end"], + "text": [val["text"]], + } + + label_dict["value"] = value_labels + text_dict["value"] = value_text + # mainly label_dict and text_dict are sent as result + result.append(label_dict) + result.append(text_dict) elif ( proj_type == "OCRTranscriptionEditing" or proj_type == "OCRSegmentCategorizationEditing" + or proj_type == "OCRTextlineSegmentation" ): data_item = OCRDocument.objects.get(pk=pk) ocr_prediction_json = ( @@ -993,7 +1079,7 @@ def convert_annotation_result_to_formatted_json( annotation_result, speakers_json, is_SpeechConversation, - is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing, + is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation, is_acoustic=False, ): transcribed_json = [] @@ -1089,14 +1175,18 @@ def convert_annotation_result_to_formatted_json( acoustic_transcribed_json, ensure_ascii=False ) else: - dicts = 2 if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing else 3 + dicts = ( + 2 + if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation + else 3 + ) for idx1 in range(0, len(annotation_result), dicts): rectangle_dict = {} labels_dict = {} text_dict = {} if isinstance(annotation_result[idx1], str): annotation_result[idx1] = json.loads(annotation_result[idx1]) - if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing: + if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation: 
custom_text_dict = {"value": {"text": ""}} text_dict = json.dumps(custom_text_dict, indent=2) for idx2 in range(idx1, idx1 + dicts): @@ -2130,9 +2220,9 @@ def create(self, request, *args, **kwargs): if automatic_annotation_creation_mode != None: if proj.metadata_json == None: proj.metadata_json = {} - proj.metadata_json[ - "automatic_annotation_creation_mode" - ] = automatic_annotation_creation_mode + proj.metadata_json["automatic_annotation_creation_mode"] = ( + automatic_annotation_creation_mode + ) if proj.project_type == "AcousticNormalisedTranscriptionEditing": if proj.metadata_json == None: proj.metadata_json = {} @@ -2302,13 +2392,29 @@ def assign_new_tasks(self, request, pk, *args, **kwargs): annotation_status__exact=UNLABELED, completed_by=cur_user ) annotation_tasks = [anno.task.id for anno in proj_annotations] - pending_tasks = ( - Task.objects.filter(project_id=pk) - .filter(annotation_users=cur_user.id) - .filter(task_status__in=[INCOMPLETE, UNLABELED]) - .filter(id__in=annotation_tasks) - .count() - ) + if project.project_type in get_audio_project_types(): + pending_tasks = ( + Task.objects.filter(project_id=pk) + .filter(annotation_users=cur_user.id) + .filter(task_status__in=[INCOMPLETE, UNLABELED]) + .filter(id__in=annotation_tasks) + .filter( + Exists( + SpeechConversation.objects.filter( + id=OuterRef("input_data_id"), freeze_task=False + ) + ) + ) + .count() + ) + else: + pending_tasks = ( + Task.objects.filter(project_id=pk) + .filter(annotation_users=cur_user.id) + .filter(task_status__in=[INCOMPLETE, UNLABELED]) + .filter(id__in=annotation_tasks) + .count() + ) # assigned_tasks_queryset = Task.objects.filter(project_id=pk).filter(annotation_users=cur_user.id) # assigned_tasks = assigned_tasks_queryset.count() # completed_tasks = Annotation_model.objects.filter(task__in=assigned_tasks_queryset).filter(completed_by__exact=cur_user.id).count() @@ -2345,6 +2451,15 @@ def assign_new_tasks(self, request, pk, *args, **kwargs): 
.exclude(annotation_users=cur_user.id) .annotate(annotator_count=Count("annotation_users")) ) + if project.project_type in get_audio_project_types(): + tasks = tasks.filter( + Exists( + SpeechConversation.objects.filter( + id=OuterRef("input_data_id"), freeze_task=False + ) + ) + ) + tasks = tasks.filter( annotator_count__lt=project.required_annotators_per_task ).distinct() @@ -2365,10 +2480,11 @@ def assign_new_tasks(self, request, pk, *args, **kwargs): "AudioTranscriptionEditing", "OCRTranscriptionEditing", "OCRSegmentCategorizationEditing", + "OCRTextlineSegmentation", ]: try: result = convert_prediction_json_to_annotation_result( - task.input_data.id, project.project_type + task.input_data.id, project.project_type, None, None, False ) except Exception as e: print( @@ -2611,6 +2727,7 @@ def assign_new_review_tasks(self, request, pk, *args, **kwargs): except Exception as e: continue # check if the project contains eligible tasks to pull + tasks = ( Task.objects.filter(project_id=pk) .filter(task_status=ANNOTATED) @@ -2618,6 +2735,16 @@ def assign_new_review_tasks(self, request, pk, *args, **kwargs): .exclude(annotation_users=cur_user.id) .distinct() ) + + if project.project_type in get_audio_project_types(): + tasks = tasks.filter( + Exists( + SpeechConversation.objects.filter( + id=OuterRef("input_data_id"), freeze_task=False + ) + ) + ) + if not tasks: project.release_lock(REVIEW_LOCK) return Response( @@ -2820,6 +2947,16 @@ def assign_new_supercheck_tasks(self, request, pk, *args, **kwargs): .exclude(review_user=cur_user.id) .distinct() ) + + if project.project_type in get_audio_project_types(): + tasks = tasks.filter( + Exists( + SpeechConversation.objects.filter( + id=OuterRef("input_data_id"), freeze_task=False + ) + ) + ) + if not tasks: project.release_lock(SUPERCHECK_LOCK) return Response( @@ -4048,7 +4185,9 @@ def download(self, request, pk=None, *args, **kwargs): try: project = Project.objects.get(pk=pk) project_type = 
@worker_ready.connect
def at_start(sender, **kwargs):
    """Celery ``worker_ready`` hook: queue one immediate run of the two
    statistics-refresh tasks instead of waiting for the first hourly beat.
    """
    # Reuse the worker's own broker connection so startup does not open a new one.
    with sender.app.connection() as conn:
        for task_name in ("fetchTaskCounts", "fetchWorkspaceTaskCounts"):
            sender.app.send_task(task_name, connection=conn)
class Statistic(models.Model):
    # Cached aggregate used by reporting: rows are written via
    # update_or_create (see user_reports.upsert_stat) by the scheduled
    # "fetchTaskCounts" / "fetchWorkspaceTaskCounts" celery tasks.
    stat_type = models.CharField(max_length=255)  # kind of snapshot, e.g. "task_count" or "workspace_task_counts"
    org_id = models.IntegerField()  # organization the snapshot belongs to
    result = models.JSONField()  # precomputed counts payload
    updated_at = models.DateTimeField(auto_now=True)  # stamped on every refresh

    class Meta:
        # Exactly one cached row per (statistic kind, organization) pair.
        unique_together = ("stat_type", "org_id")
@@ -1753,13 +1754,17 @@ def partial_update(self, request, pk=None): == "AcousticNormalisedTranscriptionEditing" else False ) - is_ocr_sc_or_sce = ( + is_ocr_sc_or_sce_or_ts = ( True if annotation_obj.task.project_id.project_type - in ["OCRSegmentCategorization", "OCRSegmentCategorizationEditing"] + in [ + "OCRSegmentCategorization", + "OCRSegmentCategorizationEditing", + "OCRTextlineSegmentation", + ] else False ) - if is_ocr_sc_or_sce and ( + if is_ocr_sc_or_sce_or_ts and ( "language" in request.data or "ocr_domain" in request.data ): language = request.data.get("languages", []) @@ -1785,11 +1790,15 @@ def partial_update(self, request, pk=None): request.data["result"], annotation_obj.task, is_acoustic_project_type, - is_acoustic_project_type - and annotation_obj.task.project_id.metadata_json[ - "acoustic_enabled_stage" - ] - == 1, + ( + is_acoustic_project_type + and "acoustic_enabled_stage" + in annotation_obj.task.project_id.metadata_json + and annotation_obj.task.project_id.metadata_json[ + "acoustic_enabled_stage" + ] + == 1 + ), ) else: annotation_obj.result = request.data["result"] @@ -1841,11 +1850,15 @@ def partial_update(self, request, pk=None): request.data["result"], annotation_obj.task, is_acoustic_project_type, - is_acoustic_project_type - and annotation_obj.task.project_id.metadata_json[ - "acoustic_enabled_stage" - ] - == 1, + ( + is_acoustic_project_type + and "acoustic_enabled_stage" + in annotation_obj.task.project_id.metadata_json + and annotation_obj.task.project_id.metadata_json[ + "acoustic_enabled_stage" + ] + == 1 + ), ) annotation_status = request.data["annotation_status"] if empty_flag == True and annotation_status in [ @@ -1914,11 +1927,15 @@ def partial_update(self, request, pk=None): request.data["result"], annotation_obj.task, is_acoustic_project_type, - is_acoustic_project_type - and annotation_obj.task.project_id.metadata_json[ - "acoustic_enabled_stage" - ] - <= 2, + ( + is_acoustic_project_type + and "acoustic_enabled_stage" + in 
annotation_obj.task.project_id.metadata_json + and annotation_obj.task.project_id.metadata_json[ + "acoustic_enabled_stage" + ] + <= 2 + ), ) else: annotation_obj.result = request.data["result"] @@ -2009,11 +2026,15 @@ def partial_update(self, request, pk=None): request.data["result"], annotation_obj.task, is_acoustic_project_type, - is_acoustic_project_type - and annotation_obj.task.project_id.metadata_json[ - "acoustic_enabled_stage" - ] - <= 2, + ( + is_acoustic_project_type + and "acoustic_enabled_stage" + in annotation_obj.task.project_id.metadata_json + and annotation_obj.task.project_id.metadata_json[ + "acoustic_enabled_stage" + ] + <= 2 + ), ) annotation_status = request.data["annotation_status"] if empty_flag == True and annotation_status in [ @@ -2109,11 +2130,15 @@ def partial_update(self, request, pk=None): request.data["result"], annotation_obj.task, is_acoustic_project_type, - is_acoustic_project_type - and annotation_obj.task.project_id.metadata_json[ - "acoustic_enabled_stage" - ] - <= 3, + ( + is_acoustic_project_type + and "acoustic_enabled_stage" + in annotation_obj.task.project_id.metadata_json + and annotation_obj.task.project_id.metadata_json[ + "acoustic_enabled_stage" + ] + <= 3 + ), ) else: annotation_obj.result = request.data["result"] @@ -2195,11 +2220,15 @@ def partial_update(self, request, pk=None): request.data["result"], annotation_obj.task, is_acoustic_project_type, - is_acoustic_project_type - and annotation_obj.task.project_id.metadata_json[ - "acoustic_enabled_stage" - ] - <= 3, + ( + is_acoustic_project_type + and "acoustic_enabled_stage" + in annotation_obj.task.project_id.metadata_json + and annotation_obj.task.project_id.metadata_json[ + "acoustic_enabled_stage" + ] + <= 3 + ), ) if empty_flag == True and annotation_status in [ LABELED, @@ -2642,3 +2671,16 @@ def get_celery_tasks(request): page_size = int(request.GET.get("page_size", 10)) data = paginate_queryset(filtered_tasks, page_number, page_size) return 
def checkNoneValue(value):
    """Return the string "0.0" in place of a missing (None) metric value;
    pass every other value through unchanged."""
    # `is None` rather than `== None` (PEP 8 E711; avoids custom __eq__ surprises).
    if value is None:
        return "0.0"
    return value


def checkLangNone(language):
    """Map a missing (None) language to the reporting bucket "Others";
    pass every other value through unchanged."""
    if language is None:
        return "Others"
    return language


def upsert_stat(stat_type, org_id, result):
    """Insert or refresh the cached ``Statistic`` row keyed by
    (stat_type, org_id), storing *result* as its JSON payload.

    Returns the (obj, created) pair from ``update_or_create``.
    """
    return Statistic.objects.update_or_create(
        stat_type=stat_type,
        org_id=org_id,
        defaults={"result": result},
    )
with connection.cursor() as cursor: + + for org in org_ids: + + final_result_for_all__types = {} + + for pjt_type in project_types: + + sql_query = f""" + with annotation_tasks (language,count) as + ( + SELECT + pjt.tgt_language, + count(tsk.id) + FROM + tasks_task AS tsk, + projects_project AS pjt + WHERE + tsk.project_id_id = pjt.id + AND tsk.task_status in ('annotated','reviewed','super_checked') + AND pjt.project_type in ('{pjt_type}') + AND pjt.organization_id_id = {org} + GROUP BY + pjt.tgt_language + ),reviewer_tasks (language,count) as + ( + SELECT + pjt.tgt_language, + count(tsk.id) + FROM + tasks_task AS tsk, + projects_project AS pjt + WHERE + tsk.project_id_id = pjt.id + AND tsk.task_status in ('reviewed','super_checked') + AND pjt.project_stage in (2,3) + AND pjt.project_type in ('{pjt_type}') + AND pjt.organization_id_id = {org} + GROUP BY + pjt.tgt_language + ) + ,superchecker_tasks (language,count) as + ( + SELECT + pjt.tgt_language, + count(tsk.id) + FROM + tasks_task AS tsk, + projects_project AS pjt + WHERE + tsk.project_id_id = pjt.id + AND tsk.task_status in ('super_checked') + AND pjt.project_stage in (3) + AND pjt.project_type in ('{pjt_type}') + AND pjt.organization_id_id = {org} + GROUP BY + pjt.tgt_language + ), + annotation_tasks_exported (language,count) as + ( + SELECT + pjt.tgt_language, + count(tsk.id) + FROM + tasks_task AS tsk, + projects_project AS pjt + WHERE + tsk.project_id_id = pjt.id + AND tsk.task_status in ('exported') + AND pjt.project_stage in (1) + AND pjt.project_type in ('{pjt_type}') + AND pjt.organization_id_id = {org} + GROUP BY + pjt.tgt_language + ), reviewer_tasks_exported (language,count) as + ( + SELECT + pjt.tgt_language, + count(tsk.id) + FROM + tasks_task AS tsk, + projects_project AS pjt + WHERE + tsk.project_id_id = pjt.id + AND tsk.task_status in ('exported') + AND pjt.project_stage in (2) + AND pjt.project_type in ('{pjt_type}') + AND pjt.organization_id_id = {org} + GROUP BY + pjt.tgt_language + ), 
supercheck_tasks_exported (language,count) as + ( + SELECT + pjt.tgt_language, + count(tsk.id) + FROM + tasks_task AS tsk, + projects_project AS pjt + WHERE + tsk.project_id_id = pjt.id + AND tsk.task_status in ('exported') + AND pjt.project_stage in (3) + AND pjt.project_type in ('{pjt_type}') + AND pjt.organization_id_id = {org} + GROUP BY + pjt.tgt_language + ), + reviewer_tasks_count (language,count,tag) as ( + SELECT + language, + SUM(count) as task_count, + 'rew' + FROM ( + SELECT language, count FROM reviewer_tasks + UNION ALL + SELECT language, count FROM reviewer_tasks_exported + UNION ALL + SELECT language, count FROM supercheck_tasks_exported + ) AS merged_tables + GROUP BY language + ), + annotation_tasks_count (language,count,tag) as ( + SELECT + language, + SUM(count) as task_count, + 'ann' + FROM ( + SELECT language, count FROM annotation_tasks + UNION ALL + SELECT language, count FROM annotation_tasks_exported + UNION ALL + SELECT language, count FROM reviewer_tasks_exported + UNION ALL + SELECT language, count FROM supercheck_tasks_exported + ) AS merged_tables + GROUP BY language + ), + supercheck_tasks_count (language,count,tag) as ( + SELECT + language, + SUM(count) as task_count, + 'sup' + FROM ( + SELECT language, count FROM superchecker_tasks + UNION ALL + SELECT language, count FROM supercheck_tasks_exported + ) AS merged_tables + GROUP BY language + ), + cumulative_task_counts (language,count,tag) as ( + select language,count,tag from annotation_tasks_count + union all + select language,count,tag from reviewer_tasks_count + union all + select language,count,tag from supercheck_tasks_count + ) + SELECT + language, + SUM(CASE WHEN tag = 'ann' THEN count ELSE 0 END) AS annotation_count, + SUM(CASE WHEN tag = 'rew' THEN count ELSE 0 END) AS reviewer_count, + SUM(CASE WHEN tag = 'sup' THEN count ELSE 0 END) AS superchecker_count + FROM cumulative_task_counts + GROUP BY language; + """ + cursor.execute(sql=sql_query) + result = cursor.fetchall() 
def fetch_workspace_task_counts():
    """Recompute cumulative task counts per workspace and cache them per organization.

    Runs one raw SQL query (CTE pipeline below) that, for every
    (workspace, target language, project type), sums the tasks that have
    cumulatively passed the annotation / review / supercheck stages, then
    stores the aggregate JSON per organization via ``upsert_stat`` under the
    stat type ``"workspace_task_counts"``.

    Result shape written per organization:
        {workspace_id: {project_type: [{"language": ...,
                                        "ann_cumulative_tasks_count": ...,
                                        "rew_cumulative_tasks_count": ...,
                                        "sup_cumulative_tasks_count": ...}, ...]}}

    NOTE(review): the PROJECT_STAGE literals 1/2/3 appear to correspond to
    annotation/review/supercheck stages (they mirror the ANNOTATION_STAGE /
    REVIEW_STAGE / SUPERCHECK_STAGE filters used in workspaces/views.py) —
    confirm against projects.models before changing them.
    """

    # CTE pipeline:
    #   *_TASKS            - tasks currently at/past a stage (by task_status)
    #   *_TASKS_EXPORTED   - exported tasks attributed back to the stage their
    #                        project stopped at (stage 1/2/3), so exports keep
    #                        counting toward every stage they passed through
    #   *_TASKS_COUNT      - per-stage totals tagged 'ann' / 'rew' / 'sup'
    #   WORKSPACE_COUNTS   - pivot of the tags into three count columns;
    #                        NULL language is bucketed as 'Others'
    #   AGGREGATED_DATA / WORKSPACE_TASK_COUNTS - JSON aggregation down to one
    #                        row per organization
    sql_query = """
    WITH
        ANNOTATION_TASKS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
            SELECT
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID,
                COUNT(TSK.ID)
            FROM
                TASKS_TASK AS TSK,
                PROJECTS_PROJECT AS PJT
            WHERE
                TSK.PROJECT_ID_ID = PJT.ID
                AND TSK.TASK_STATUS IN ('annotated', 'reviewed', 'super_checked')
            GROUP BY
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID
        ),
        REVIEWER_TASKS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
            SELECT
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID,
                COUNT(TSK.ID)
            FROM
                TASKS_TASK AS TSK,
                PROJECTS_PROJECT AS PJT
            WHERE
                TSK.PROJECT_ID_ID = PJT.ID
                AND TSK.TASK_STATUS IN ('reviewed', 'super_checked')
                AND PJT.PROJECT_STAGE IN (2, 3)
            GROUP BY
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID
        ),
        SUPERCHECKER_TASKS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
            SELECT
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID,
                COUNT(TSK.ID)
            FROM
                TASKS_TASK AS TSK,
                PROJECTS_PROJECT AS PJT
            WHERE
                TSK.PROJECT_ID_ID = PJT.ID
                AND TSK.TASK_STATUS IN ('super_checked')
                AND PJT.PROJECT_STAGE IN (3)
            GROUP BY
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID
        ),
        ANNOTATION_TASKS_EXPORTED (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
            SELECT
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID,
                COUNT(TSK.ID)
            FROM
                TASKS_TASK AS TSK,
                PROJECTS_PROJECT AS PJT
            WHERE
                TSK.PROJECT_ID_ID = PJT.ID
                AND TSK.TASK_STATUS IN ('exported')
                AND PJT.PROJECT_STAGE IN (1)
            GROUP BY
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID
        ),
        REVIEWER_TASKS_EXPORTED (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
            SELECT
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID,
                COUNT(TSK.ID)
            FROM
                TASKS_TASK AS TSK,
                PROJECTS_PROJECT AS PJT
            WHERE
                TSK.PROJECT_ID_ID = PJT.ID
                AND TSK.TASK_STATUS IN ('exported')
                AND PJT.PROJECT_STAGE IN (2)
            GROUP BY
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID
        ),
        SUPERCHECK_TASKS_EXPORTED (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
            SELECT
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID,
                COUNT(TSK.ID)
            FROM
                TASKS_TASK AS TSK,
                PROJECTS_PROJECT AS PJT
            WHERE
                TSK.PROJECT_ID_ID = PJT.ID
                AND TSK.TASK_STATUS IN ('exported')
                AND PJT.PROJECT_STAGE IN (3)
            GROUP BY
                PJT.TGT_LANGUAGE,
                PJT.PROJECT_TYPE,
                PJT.WORKSPACE_ID_ID
        ),
        REVIEWER_TASKS_COUNT (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
            SELECT
                LANGUAGE,
                PROJECT_TYPE,
                WORKSPACE_ID,
                SUM(COUNT) AS TASK_COUNT,
                'rew'
            FROM
                (
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM REVIEWER_TASKS
                    UNION ALL
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM REVIEWER_TASKS_EXPORTED
                    UNION ALL
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM SUPERCHECK_TASKS_EXPORTED
                ) AS MERGED_TABLES
            GROUP BY
                LANGUAGE,
                PROJECT_TYPE,
                WORKSPACE_ID
        ),
        ANNOTATION_TASKS_COUNT (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
            SELECT
                LANGUAGE,
                PROJECT_TYPE,
                WORKSPACE_ID,
                SUM(COUNT) AS TASK_COUNT,
                'ann'
            FROM
                (
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM ANNOTATION_TASKS
                    UNION ALL
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM ANNOTATION_TASKS_EXPORTED
                    UNION ALL
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM REVIEWER_TASKS_EXPORTED
                    UNION ALL
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM SUPERCHECK_TASKS_EXPORTED
                ) AS MERGED_TABLES
            GROUP BY
                LANGUAGE,
                PROJECT_TYPE,
                WORKSPACE_ID
        ),
        SUPERCHECK_TASKS_COUNT (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
            SELECT
                LANGUAGE,
                PROJECT_TYPE,
                WORKSPACE_ID,
                SUM(COUNT) AS TASK_COUNT,
                'sup'
            FROM
                (
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM SUPERCHECKER_TASKS
                    UNION ALL
                    SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT
                    FROM SUPERCHECK_TASKS_EXPORTED
                ) AS MERGED_TABLES
            GROUP BY
                LANGUAGE,
                PROJECT_TYPE,
                WORKSPACE_ID
        ),
        CUMULATIVE_TASK_COUNTS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
            SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG
            FROM ANNOTATION_TASKS_COUNT
            UNION ALL
            SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG
            FROM REVIEWER_TASKS_COUNT
            UNION ALL
            SELECT LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG
            FROM SUPERCHECK_TASKS_COUNT
        ),
        WORKSPACE_COUNTS (
            WORKSPACE_ID,
            LANGUAGE,
            PROJECT_TYPE,
            ANNOTATION_COUNT,
            REVIEWER_COUNT,
            SUPERCHECKER_COUNT
        ) AS (
            SELECT
                CTC.WORKSPACE_ID,
                COALESCE(CTC.LANGUAGE, 'Others'),
                CTC.PROJECT_TYPE,
                SUM(CASE WHEN TAG = 'ann' THEN CTC.COUNT ELSE 0 END) AS ANNOTATION_COUNT,
                SUM(CASE WHEN TAG = 'rew' THEN CTC.COUNT ELSE 0 END) AS REVIEWER_COUNT,
                SUM(CASE WHEN TAG = 'sup' THEN CTC.COUNT ELSE 0 END) AS SUPERCHECKER_COUNT
            FROM
                CUMULATIVE_TASK_COUNTS AS CTC
                JOIN WORKSPACES_WORKSPACE AS WSP ON WSP.ID = CTC.WORKSPACE_ID
            GROUP BY
                CTC.LANGUAGE,
                CTC.PROJECT_TYPE,
                CTC.WORKSPACE_ID,
                WSP.WORKSPACE_NAME
        ),
        AGGREGATED_DATA (WORKSPACE_ID, PROJECT_TYPE, PROJECT_DATA) AS (
            SELECT
                WORKSPACE_ID,
                PROJECT_TYPE,
                JSON_AGG(
                    JSON_BUILD_OBJECT(
                        'language',
                        LANGUAGE,
                        'ann_cumulative_tasks_count',
                        ANNOTATION_COUNT,
                        'rew_cumulative_tasks_count',
                        REVIEWER_COUNT,
                        'sup_cumulative_tasks_count',
                        SUPERCHECKER_COUNT
                    )
                ) AS PROJECT_DATA
            FROM
                WORKSPACE_COUNTS
            GROUP BY
                PROJECT_TYPE,
                WORKSPACE_ID
        ),
        WORKSPACE_TASK_COUNTS (WORKSPACE_ID, ORGANIZATION_ID, RESULT) AS (
            SELECT
                ADT.WORKSPACE_ID,
                WSP.ORGANIZATION_ID,
                JSON_OBJECT_AGG(ADT.PROJECT_TYPE, ADT.PROJECT_DATA) AS RESULT
            FROM
                AGGREGATED_DATA AS ADT
                JOIN WORKSPACES_WORKSPACE AS WSP ON WSP.ID = ADT.WORKSPACE_ID
            GROUP BY
                ADT.WORKSPACE_ID,
                WSP.ORGANIZATION_ID
        )
    SELECT
        ORGANIZATION_ID,
        JSONB_OBJECT_AGG(WORKSPACE_ID, RESULT) as workspace_task_counts
    FROM
        WORKSPACE_TASK_COUNTS
    GROUP BY
        ORGANIZATION_ID
    """
    with connection.cursor() as cursor:

        cursor.execute(sql=sql_query)
        # One row per organization: (org_id, JSON text of all its workspaces).
        result = cursor.fetchall()
        for org_id, workspace_task_counts in result:
            # JSONB comes back as a string from the raw cursor; decode before
            # storing so the Statistic row holds a real dict.
            workspace_task_counts = json.loads(workspace_task_counts)
            upsert_stat(
                stat_type="workspace_task_counts",
                org_id=org_id,
                result=workspace_task_counts,
            )


# --- backend/users/tasks.py (Celery entry points introduced by this patch) ---

logger = get_task_logger(__name__)


@shared_task(name="send_mail_task")
def send_mail_task():
    """Celery task: build and e-mail the periodic user reports."""
    calculate_reports()


@shared_task(name="fetchTaskCounts")
def fetchTaskCounts():
    """Celery task: refresh the cached per-organization task counts.

    Thin wrapper around ``user_reports.fetch_task_counts`` so the heavy SQL
    runs on a worker instead of a request thread.
    """
    fetch_task_counts()
    logger.info("Completed Task Count Update")


@shared_task(name="fetchWorkspaceTaskCounts")
def fetchWorkspaceTaskCounts():
    """Celery task: refresh the cached per-workspace task counts.

    Wrapper around ``user_reports.fetch_workspace_task_counts`` (above).
    """
    fetch_workspace_task_counts()
    logger.info("Completed Workspace Task Count Update")
b/backend/utils/constants.py @@ -0,0 +1,51 @@ +LANG_CHOICES = ( + ("English", "English"), + ("Assamese", "Assamese"), + ("Bengali", "Bengali"), + ("Bodo", "Bodo"), + ("Dogri", "Dogri"), + ("Gujarati", "Gujarati"), + ("Hindi", "Hindi"), + ("Kannada", "Kannada"), + ("Kashmiri", "Kashmiri"), + ("Konkani", "Konkani"), + ("Maithili", "Maithili"), + ("Malayalam", "Malayalam"), + ("Manipuri", "Manipuri"), + ("Marathi", "Marathi"), + ("Nepali", "Nepali"), + ("Odia", "Odia"), + ("Punjabi", "Punjabi"), + ("Sanskrit", "Sanskrit"), + ("Santali", "Santali"), + ("Sindhi", "Sindhi"), + ("Sinhala", "Sinhala"), + ("Tamil", "Tamil"), + ("Telugu", "Telugu"), + ("Urdu", "Urdu"), +) + +lang_codes = { + "assamese": "as", + "bengali": "bn", + "bodo": "brx", + "dogri": "doi", + "gujarati": "gu", + "hindi": "hi", + "kannada": "kn", + "kashmiri": "ks", + "konkani": "kok", + "maithili": "mai", + "malayalam": "ml", + "manipuri": "mni", + "marathi": "mr", + "nepali": "ne", + "odia": "or", + "punjabi": "pa", + "sanskrit": "sa", + "santali": "sat", + "sindhi": "sd", + "tamil": "ta", + "telugu": "te", + "urdu": "ur", +} diff --git a/backend/utils/dataset_utils.py b/backend/utils/dataset_utils.py new file mode 100644 index 000000000..0cf950fec --- /dev/null +++ b/backend/utils/dataset_utils.py @@ -0,0 +1,15 @@ +def get_batch_dataset_upload_status(instance_ids): + """ + Batch fetch upload status for a list of dataset instance IDs. + Replace this with actual logic to retrieve status from your database. 
+ """ + # Mock data for testing + status_data = {} + for instance_id in instance_ids: + status_data[instance_id] = { + "last_upload_status": "Completed", + "last_upload_date": "2023-01-01", + "last_upload_time": "12:00:00", + "last_upload_result": "Success", + } + return status_data \ No newline at end of file diff --git a/backend/utils/filter_tasks_by_ann_type.py b/backend/utils/filter_tasks_by_ann_type.py new file mode 100644 index 000000000..3fb641293 --- /dev/null +++ b/backend/utils/filter_tasks_by_ann_type.py @@ -0,0 +1,47 @@ +from tasks.models import ( + Annotation, + ANNOTATOR_ANNOTATION, + REVIEWER_ANNOTATION, + SUPER_CHECKER_ANNOTATION, + LABELED, + ACCEPTED, + ACCEPTED_WITH_MINOR_CHANGES, + ACCEPTED_WITH_MAJOR_CHANGES, + VALIDATED, + VALIDATED_WITH_CHANGES, +) + + +def filter_tasks_by_ann_type(annotation_tasks, reviewer_tasks, supercheck_tasks): + filtered_annotation_tasks, filtered_reviewer_tasks, filtered_supercheck_tasks = ( + [], + [], + [], + ) + for a in annotation_tasks: + anno = Annotation.objects.filter( + task=a, annotation_type=ANNOTATOR_ANNOTATION, annotation_status=LABELED + )[0] + if anno: + filtered_annotation_tasks.append(a) + for r in reviewer_tasks: + anno = Annotation.objects.filter( + task=r, + annotation_type=REVIEWER_ANNOTATION, + annotation_status__in=[ + ACCEPTED, + ACCEPTED_WITH_MINOR_CHANGES, + ACCEPTED_WITH_MAJOR_CHANGES, + ], + )[0] + if anno: + filtered_reviewer_tasks.append(r) + for s in supercheck_tasks: + anno = Annotation.objects.filter( + task=s, + annotation_type=SUPER_CHECKER_ANNOTATION, + annotation_status__in=[VALIDATED, VALIDATED_WITH_CHANGES], + )[0] + if anno: + filtered_supercheck_tasks.append(s) + return filtered_annotation_tasks, filtered_reviewer_tasks, filtered_supercheck_tasks diff --git a/backend/workspaces/views.py b/backend/workspaces/views.py index e5cd136d9..2318b1866 100644 --- a/backend/workspaces/views.py +++ b/backend/workspaces/views.py @@ -8,7 +8,7 @@ from projects.models import Project, 
ANNOTATION_STAGE, REVIEW_STAGE, SUPERCHECK_STAGE from users.models import User from users.serializers import UserProfileSerializer -from tasks.models import Task +from tasks.models import Task, Statistic from organizations.models import Organization from django.db.models import Q from projects.utils import no_of_words @@ -61,7 +61,7 @@ get_review_reports, get_supercheck_reports, ) - +from utils.filter_tasks_by_ann_type import filter_tasks_by_ann_type # Create your views here. @@ -1404,23 +1404,41 @@ def cumulative_tasks_count_all(self, request, pk=None): other_lang = [] for lang in languages: proj_lang_filter = proj_objs.filter(tgt_language=lang) - annotation_tasks_count = 0 - reviewer_task_count = 0 + annotation_tasks = Task.objects.filter( + project_id__in=proj_lang_filter, + task_status__in=[ + "annotated", + "reviewed", + "super_checked", + ], + ) reviewer_tasks = Task.objects.filter( project_id__in=proj_lang_filter, project_id__project_stage__in=[REVIEW_STAGE, SUPERCHECK_STAGE], - task_status__in=["reviewed", "exported", "super_checked"], + task_status__in=["reviewed", "super_checked"], ) - - annotation_tasks = Task.objects.filter( + supercheck_tasks = Task.objects.filter( + project_id__in=proj_lang_filter, + project_id__project_stage__in=[SUPERCHECK_STAGE], + task_status__in=["super_checked"], + ) + annotation_tasks_exported = Task.objects.filter( project_id__in=proj_lang_filter, + project_id__project_stage__in=[ANNOTATION_STAGE], task_status__in=[ - "annotated", - "reviewed", "exported", - "super_checked", ], ) + reviewer_tasks_exported = Task.objects.filter( + project_id__in=proj_lang_filter, + project_id__project_stage__in=[REVIEW_STAGE], + task_status__in=["exported"], + ) + supercheck_tasks_exported = Task.objects.filter( + project_id__in=proj_lang_filter, + project_id__project_stage__in=[SUPERCHECK_STAGE], + task_status__in=["exported"], + ) if metainfo == True: result = {} @@ -1636,15 +1654,30 @@ def cumulative_tasks_count_all(self, request, pk=None): 
} else: - reviewer_task_count = reviewer_tasks.count() - - annotation_tasks_count = annotation_tasks.count() - - result = { - "language": lang, - "ann_cumulative_tasks_count": annotation_tasks_count, - "rew_cumulative_tasks_count": reviewer_task_count, - } + # reviewer_task_count = ( + # reviewer_tasks.count() + # + reviewer_tasks_exported.count() + # + supercheck_tasks_exported.count() + # ) + + # annotation_tasks_count = ( + # annotation_tasks.count() + # + annotation_tasks_exported.count() + # + reviewer_tasks_exported.count() + # + supercheck_tasks_exported.count() + # ) + + # supercheck_tasks_count = ( + # supercheck_tasks.count() + supercheck_tasks_exported.count() + # ) + + # result = { + # "language": lang, + # "ann_cumulative_tasks_count": annotation_tasks_count, + # "rew_cumulative_tasks_count": reviewer_task_count, + # "sup_cumulative_tasks_count": supercheck_tasks_count, + # } + result = {} if lang == None or lang == "": other_lang.append(result) @@ -1653,6 +1686,7 @@ def cumulative_tasks_count_all(self, request, pk=None): ann_task_count = 0 rew_task_count = 0 + sup_task_count = 0 ann_word_count = 0 rew_word_count = 0 ann_aud_dur = 0 @@ -1665,8 +1699,10 @@ def cumulative_tasks_count_all(self, request, pk=None): rev_sentance_count = 0 for dat in other_lang: if metainfo != True: - ann_task_count += dat["ann_cumulative_tasks_count"] - rew_task_count += dat["rew_cumulative_tasks_count"] + # ann_task_count += dat["ann_cumulative_tasks_count"] + # rew_task_count += dat["rew_cumulative_tasks_count"] + # sup_task_count += dat["sup_cumulative_tasks_count"] + pass else: if project_type in get_audio_project_types(): ann_aud_dur += convert_hours_to_seconds( @@ -1706,11 +1742,13 @@ def cumulative_tasks_count_all(self, request, pk=None): if len(other_lang) > 0: if metainfo != True: - other_language = { - "language": "Others", - "ann_cumulative_tasks_count": ann_task_count, - "rew_cumulative_tasks_count": rew_task_count, - } + other_language = {} + # other_language = 
{ + # "language": "Others", + # "ann_cumulative_tasks_count": ann_task_count, + # "rew_cumulative_tasks_count": rew_task_count, + # "sup_cumulative_tasks_count": sup_task_count, + # } else: if project_type in get_audio_project_types(): other_language = { @@ -1782,6 +1820,18 @@ def cumulative_tasks_count_all(self, request, pk=None): pass else: final_result_for_all_types[project_type] = final_result + + if metainfo != True: + workspace = Workspace.objects.get(pk=pk) + print(workspace.organization_id) + task_counts = list( + Statistic.objects.filter( + stat_type="workspace_task_counts", org_id=workspace.organization_id + ) + )[0].result + + for pjt_type in project_types: + final_result_for_all_types[pjt_type] = task_counts[str(pk)][pjt_type] return Response(final_result_for_all_types) @action(