diff --git a/backend/dataset/admin.py b/backend/dataset/admin.py
index 140171494..0ecdb9a6d 100644
--- a/backend/dataset/admin.py
+++ b/backend/dataset/admin.py
@@ -1,4 +1,4 @@
-import resource
+# import resource
from django.contrib import admin
from import_export.admin import ImportExportActionModelAdmin
from .resources import *
diff --git a/backend/dataset/migrations/0047_speechconversation_freeze_task.py b/backend/dataset/migrations/0047_speechconversation_freeze_task.py
new file mode 100644
index 000000000..488cf352a
--- /dev/null
+++ b/backend/dataset/migrations/0047_speechconversation_freeze_task.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.14 on 2024-12-31 01:54
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('dataset', '0046_merge_20240416_2233'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='speechconversation',
+ name='freeze_task',
+ field=models.BooleanField(default=False, help_text='Field to Indicate whether the current task is frozen by the administrator to prevent being annotated.', verbose_name='freeze_task'),
+ ),
+ ]
diff --git a/backend/dataset/models.py b/backend/dataset/models.py
index ec16c9903..12631b7fc 100644
--- a/backend/dataset/models.py
+++ b/backend/dataset/models.py
@@ -1,6 +1,7 @@
"""
Model definitions for Dataset Management
"""
+
from django.db import models
from users.models import User, LANG_CHOICES
from organizations.models import Organization
@@ -485,6 +486,12 @@ class SpeechConversation(DatasetBase):
help_text=("Prepopulated prediction for the implemented models"),
)
+ freeze_task = models.BooleanField(
+ verbose_name="freeze_task",
+ default=False,
+ help_text="Field to Indicate whether the current task is frozen by the administrator to prevent being annotated.",
+ )
+
def __str__(self):
return str(self.id)
diff --git a/backend/dataset/serializers.py b/backend/dataset/serializers.py
index a6152f3c2..3cd04221d 100644
--- a/backend/dataset/serializers.py
+++ b/backend/dataset/serializers.py
@@ -11,6 +11,21 @@ class DatasetInstanceSerializer(serializers.ModelSerializer):
class Meta:
model = DatasetInstance
fields = "__all__"
+
+
+class DatasetInstanceSerializerOptimized(serializers.ModelSerializer):
+ class Meta:
+ model = DatasetInstance
+ fields = [
+ "instance_id",
+ "parent_instance_id",
+ "instance_name",
+ "instance_description",
+ "dataset_type",
+ "public_to_managers",
+ "organisation_id"
+ ]
+
class DatasetInstanceUploadSerializer(serializers.Serializer):
diff --git a/backend/dataset/views.py b/backend/dataset/views.py
index 7e6b4227c..fc0407ec8 100644
--- a/backend/dataset/views.py
+++ b/backend/dataset/views.py
@@ -38,6 +38,11 @@
from . import resources
from .models import *
from .serializers import *
+from django.db.models import Prefetch, Q, F
+from utils.dataset_utils import get_batch_dataset_upload_status
+from rest_framework.response import Response
+from rest_framework.decorators import action
+from rest_framework import status
from .tasks import upload_data_to_data_instance, deduplicate_dataset_instance_items
import dataset
from tasks.models import (
@@ -186,6 +191,22 @@ def get_dataset_upload_status(dataset_instance_pk):
# Create your views here.
+# def get_batch_dataset_upload_status(instance_ids):
+# """
+# Batch fetch upload status for a list of dataset instance IDs.
+# Replace this with actual logic to retrieve status from your database.
+# """
+# # Mock data for testing
+# status_data = {}
+# for instance_id in instance_ids:
+# status_data[instance_id] = {
+# "last_upload_status": "Completed",
+# "last_upload_date": "2023-01-01",
+# "last_upload_time": "12:00:00",
+# "last_upload_result": "Success",
+# }
+# return status_data
+
class DatasetInstanceViewSet(viewsets.ModelViewSet):
"""
ViewSet for Dataset Instance
@@ -244,6 +265,8 @@ def retrieve(self, request, pk, *args, **kwargs):
),
],
)
+
+
def list(self, request, *args, **kwargs):
# Org Owners and superusers see all datasets
if request.user.is_superuser:
@@ -257,7 +280,6 @@ def list(self, request, *args, **kwargs):
queryset = DatasetInstance.objects.filter(
organisation_id=request.user.organization
).filter(Q(public_to_managers=True) | Q(users__id=request.user.id))
-
if "dataset_visibility" in request.query_params:
dataset_visibility = request.query_params["dataset_visibility"]
if dataset_visibility == "all_public_datasets":
@@ -267,18 +289,15 @@ def list(self, request, *args, **kwargs):
queryset = queryset.filter(public_to_managers=True)
elif dataset_visibility == "my_datasets":
queryset = queryset.filter(users__id=request.user.id)
-
# Filter the queryset based on the query params
if "dataset_type" in dict(request.query_params):
queryset = queryset.filter(
dataset_type__exact=request.query_params["dataset_type"]
)
-
# Serialize the distinct items and sort by instance ID
serializer = DatasetInstanceSerializer(
queryset.distinct().order_by("instance_id"), many=True
)
-
# Add status fields to the serializer data
for dataset_instance in serializer.data:
# Get the task statuses for the dataset instance
@@ -288,14 +307,65 @@ def list(self, request, *args, **kwargs):
dataset_instance_time,
dataset_instance_result,
) = get_dataset_upload_status(dataset_instance["instance_id"])
-
# Add the task status and time to the dataset instance response
dataset_instance["last_upload_status"] = dataset_instance_status
dataset_instance["last_upload_date"] = dataset_instance_date
dataset_instance["last_upload_time"] = dataset_instance_time
dataset_instance["last_upload_result"] = dataset_instance_result
-
return Response(serializer.data)
+
+
+    # Optimized list endpoint: role-based filtering, prefetched users, batched upload-status lookups.
+ @action(detail=False, methods=["get"], url_path="optimized-list")
+ def list_optimized(self, request):
+ # Base queryset determination based on user role
+ queryset = DatasetInstance.objects.all()
+ if request.user.is_superuser:
+ queryset = queryset
+ elif request.user.role == User.ORGANIZATION_OWNER:
+ queryset = queryset.filter(
+ organisation_id=request.user.organization
+ )
+ else:
+ queryset = queryset.filter(
+ organisation_id=request.user.organization
+ ).filter(Q(public_to_managers=True) | Q(users__id=request.user.id))
+ # Apply filters using request query parameters
+ dataset_visibility = request.query_params.get("dataset_visibility")
+ if dataset_visibility == "all_public_datasets":
+ queryset = queryset.filter(public_to_managers=True)
+ elif dataset_visibility == "my_datasets":
+ queryset = queryset.filter(users__id=request.user.id)
+ dataset_type = request.query_params.get("dataset_type")
+ if dataset_type:
+ queryset = queryset.filter(dataset_type__exact=dataset_type)
+ archived_datasets = request.query_params.get("archived_datasets")
+ if archived_datasets == "true":
+ queryset = queryset.filter(is_archived=True)
+ elif archived_datasets == "false":
+ queryset = queryset.filter(is_archived=False)
+ # Sort by criteria
+ sort_type = request.query_params.get("sort_type")
+ if sort_type == "recently_updated":
+ queryset = queryset.order_by(F("last_updated").desc(nulls_last=True))
+ else:
+ queryset = queryset.order_by("instance_id")
+ # Optimize related field loading
+ queryset = queryset.prefetch_related(
+ Prefetch("users"), # Prefetch the related users
+ )
+ # Serialize the data
+ serializer = DatasetInstanceSerializerOptimized(queryset.distinct(), many=True)
+ # Batch process upload status for all datasets
+ instance_ids = [instance["instance_id"] for instance in serializer.data]
+ status_data = get_batch_dataset_upload_status(instance_ids)
+ # Annotate upload status in the response
+ for dataset_instance in serializer.data:
+ instance_id = dataset_instance["instance_id"]
+ if instance_id in status_data:
+ dataset_instance.update(status_data[instance_id])
+ return Response(serializer.data, status=status.HTTP_200_OK)
+
@is_organization_owner
@action(methods=["GET"], detail=True, name="Download Dataset in CSV format")
diff --git a/backend/functions/tasks.py b/backend/functions/tasks.py
index f6bf51261..5e74ac8d8 100644
--- a/backend/functions/tasks.py
+++ b/backend/functions/tasks.py
@@ -1,7 +1,10 @@
import datetime
+import json
import time
import zipfile
import threading
+
+import requests
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
import pandas as pd
from celery import shared_task
@@ -29,7 +32,7 @@
ANNOTATED,
)
from tasks.views import SentenceOperationViewSet
-from users.models import User, LANG_CHOICES
+from users.models import User
from django.core.mail import EmailMessage
from utils.blob_functions import (
@@ -47,7 +50,7 @@
get_batch_asr_predictions,
)
from django.db import transaction, DataError, IntegrityError
-from dataset.models import DatasetInstance
+from dataset.models import DatasetInstance, SpeechConversation
from django.apps import apps
from rest_framework.test import APIRequestFactory
from django.http import QueryDict
@@ -56,8 +59,13 @@
import tempfile
from shoonya_backend.locks import Lock
-
+from utils.constants import LANG_CHOICES
+from projects.tasks import filter_data_items
+from projects.models import BATCH
+from dataset import models as dataset_models
+from projects.registry_helper import ProjectRegistry
import logging
+from tqdm import tqdm
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -72,6 +80,10 @@ def sentence_text_translate_and_save_translation_pairs(
input_dataset_instance_id,
output_dataset_instance_id,
batch_size,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+ variable_parameters,
api_type="indic-trans-v2",
checks_for_particular_languages=False,
automate_missing_data_items=True,
@@ -87,6 +99,10 @@ def sentence_text_translate_and_save_translation_pairs(
Allowed - [indic-trans, google, indic-trans-v2, azure, blank]
checks_for_particular_languages (bool): If True, checks for the particular languages in the translations.
automate_missing_data_items (bool): If True, consider only those data items that are missing in the target dataset instance.
+ filter_string (str): string to filter input data.
+ sampling_mode (str): can be batch or full.
+        sampling_parameters (json): a JSON object containing the batch number and batch size.
+        variable_parameters (json): additional variable parameters for the job, if any.
"""
task_name = "sentence_text_translate_and_save_translation_pairs"
output_sentences = list(
@@ -113,6 +129,14 @@ def sentence_text_translate_and_save_translation_pairs(
"metadata_json",
)
)
+ if filter_string and sampling_mode and sampling_parameters:
+ input_sentences = get_filtered_items(
+ "SentenceText",
+ input_dataset_instance_id,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+ )
# Convert the input_sentences list into a dataframe
input_sentences_complete_df = pd.DataFrame(
@@ -403,7 +427,15 @@ def conversation_data_machine_translation(
@shared_task(bind=True)
def generate_ocr_prediction_json(
- self, dataset_instance_id, user_id, api_type, automate_missing_data_items
+ self,
+ dataset_instance_id,
+ user_id,
+ api_type,
+ automate_missing_data_items,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+ variable_parameters,
):
"""Function to generate OCR prediction data and to save to the same data item.
Args:
@@ -436,7 +468,14 @@ def generate_ocr_prediction_json(
)
except Exception as e:
ocr_data_items = []
-
+ if filter_string and sampling_mode and sampling_parameters:
+ ocr_data_items = get_filtered_items(
+ "OCRDocument",
+ dataset_instance_id,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+ )
# converting the dataset_instance to pandas dataframe.
ocr_data_items_df = pd.DataFrame(
ocr_data_items,
@@ -555,7 +594,15 @@ def generate_ocr_prediction_json(
@shared_task(bind=True)
def generate_asr_prediction_json(
- self, dataset_instance_id, user_id, api_type, automate_missing_data_items
+ self,
+ dataset_instance_id,
+ user_id,
+ api_type,
+ automate_missing_data_items,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+ variable_parameters,
):
"""Function to generate ASR prediction data and to save to the same data item.
Args:
@@ -589,7 +636,14 @@ def generate_asr_prediction_json(
)
except Exception as e:
asr_data_items = []
-
+ if filter_string and sampling_mode and sampling_parameters:
+ asr_data_items = get_filtered_items(
+ "SpeechConversation",
+ dataset_instance_id,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+ )
# converting the dataset_instance to pandas dataframe.
asr_data_items_df = pd.DataFrame(
asr_data_items,
@@ -703,7 +757,16 @@ def generate_asr_prediction_json(
@shared_task(bind=True)
-def populate_draft_data_json(self, pk, user_id, fields_list):
+def populate_draft_data_json(
+ self,
+ pk,
+ user_id,
+ fields_list,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+ variable_parameters,
+):
task_name = "populate_draft_data_json"
try:
dataset_instance = DatasetInstance.objects.get(pk=pk)
@@ -712,6 +775,10 @@ def populate_draft_data_json(self, pk, user_id, fields_list):
dataset_type = dataset_instance.dataset_type
dataset_model = apps.get_model("dataset", dataset_type)
dataset_items = dataset_model.objects.filter(instance_id=dataset_instance)
+ if filter_string and sampling_mode and sampling_parameters:
+ dataset_items = get_filtered_items(
+ dataset_type, pk, filter_string, sampling_mode, sampling_parameters
+ )
cnt = 0
for dataset_item in dataset_items:
new_draft_data_json = {}
@@ -1695,3 +1762,98 @@ def upload_all_projects_to_blob_and_get_url(csv_files_directory):
return "Error in generating url"
blob_url = f"https://{account_name}.blob.{endpoint_suffix}/{CONTAINER_NAME_FOR_DOWNLOAD_ALL_PROJECTS}/{blob_client.blob_name}?{sas_token}"
return blob_url
+
+
+def get_filtered_items(
+ dataset_model,
+ dataset_instance_id,
+ filter_string,
+ sampling_mode,
+ sampling_parameters,
+):
+ registry_helper = ProjectRegistry.get_instance()
+ project_type = registry_helper.get_project_name_from_dataset(dataset_model)
+ if not isinstance(dataset_instance_id, list):
+ dataset_instance_id = [dataset_instance_id]
+ filtered_items = filter_data_items(
+ project_type=project_type,
+ dataset_instance_ids=dataset_instance_id,
+ filter_string=filter_string,
+ )
+ # Apply sampling
+ if sampling_mode == BATCH:
+ batch_size = sampling_parameters["batch_size"]
+ try:
+ batch_number = sampling_parameters["batch_number"]
+ if len(batch_number) == 0:
+ batch_number = [1]
+ except KeyError:
+ batch_number = [1]
+ sampled_items = []
+ for batch_num in batch_number:
+ sampled_items += filtered_items[
+ batch_size * (batch_num - 1) : batch_size * batch_num
+ ]
+ else:
+ sampled_items = filtered_items
+ return sampled_items
+
+
+@shared_task(
+ bind=True,
+)
+def update_SpeechConversation(self, lang, pid, auto_annotation, user_id):
+ UPDATE_SPEECH_CONVERSATION_API_URL = os.getenv("UPDATE_SPEECH_CONVERSATION_API_URL")
+ user_name = User.objects.filter(id=user_id)[0].username
+ data_item_list = [
+ t.input_data_id
+ for t in Task.objects.filter(project_id=pid, task_status="incomplete")
+ ]
+ tasks_objects = Task.objects.filter(project_id=pid, task_status="incomplete")
+ related_tasks_ids = [task.id for task in tasks_objects]
+ related_annos = Annotation.objects.filter(task__id__in=related_tasks_ids)
+ for anno in related_annos:
+ anno.delete()
+ for task in tasks_objects:
+ task.delete()
+ data_items = SpeechConversation.objects.filter(id__in=data_item_list)
+ data_items_list = []
+ for data_item in tqdm(data_items):
+ try:
+ MEDIA_URL = data_item.audio_url
+ pred_json = (
+ json.loads(data_item.prediction_json)
+ if isinstance(data_item.prediction_json, str)
+ else data_item.prediction_json
+ )
+ data = [{"audioUrl": MEDIA_URL, "audioJson": pred_json, "audioLang": lang}]
+ pred_text_json = requests.post(
+ UPDATE_SPEECH_CONVERSATION_API_URL, json=json.dumps(data)
+ )
+ json_pred_final = json.loads(pred_text_json.text)[0]
+ except:
+ pass
+ setattr(data_item, "prediction_json", json_pred_final)
+ data_items_list.append(data_item)
+ SpeechConversation.objects.bulk_update(data_items_list, ["prediction_json"], 512)
+
+ data_items_list = []
+ for data_item in tqdm(data_items):
+ new_draft_data_json = {}
+ pred_json = (
+ json.loads(data_item.prediction_json)
+ if isinstance(data_item.prediction_json, str)
+ else data_item.prediction_json
+ )
+ try:
+ new_draft_data_json["transcribed_json"] = getattr(
+ data_item, "prediction_json"
+ )
+ if new_draft_data_json["transcribed_json"] == "None":
+ del new_draft_data_json["transcribed_json"]
+ except:
+ pass
+ setattr(data_item, "draft_data_json", new_draft_data_json)
+ data_items_list.append(data_item)
+ SpeechConversation.objects.bulk_update(data_items_list, ["draft_data_json"], 512)
+ print(f"SpeechConversation Dataset updated for {pid} by {user_name}")
diff --git a/backend/functions/urls.py b/backend/functions/urls.py
index 4b3632568..ac11d9e50 100644
--- a/backend/functions/urls.py
+++ b/backend/functions/urls.py
@@ -29,6 +29,7 @@
),
path("schedule_project_reports_email", schedule_project_reports_email),
path("download_all_projects", download_all_projects),
+ path("schedule_update_SpeechConversation", schedule_update_SpeechConversation),
]
# urlpatterns = format_suffix_patterns(urlpatterns)
diff --git a/backend/functions/views.py b/backend/functions/views.py
index 09608665b..ee83c4f2f 100644
--- a/backend/functions/views.py
+++ b/backend/functions/views.py
@@ -14,6 +14,7 @@
)
from tasks.models import *
+from utils.constants import lang_codes
from .tasks import (
conversation_data_machine_translation,
@@ -23,6 +24,7 @@
generate_asr_prediction_json,
schedule_mail_for_project_reports,
schedule_mail_to_download_all_projects,
+ update_SpeechConversation,
)
from .utils import (
check_conversation_translation_function_inputs,
@@ -274,6 +276,10 @@ def schedule_sentence_text_translate_job(request):
automate_missing_data_items = request.data.get(
"automate_missing_data_items", "true"
)
+ filter_string = request.data.get("filter_string", None)
+ sampling_mode = request.data.get("sampling_mode", None)
+ sampling_parameters = request.data.get("sampling_parameters_json", None)
+ variable_parameters = request.data.get("variable_parameters", None)
# Convert checks for languages into boolean
checks_for_particular_languages = checks_for_particular_languages.lower() == "true"
@@ -311,6 +317,10 @@ def schedule_sentence_text_translate_job(request):
input_dataset_instance_id=input_dataset_instance_id,
output_dataset_instance_id=output_dataset_instance_id,
batch_size=batch_size,
+ filter_string=filter_string,
+ sampling_mode=sampling_mode,
+ sampling_parameters=sampling_parameters,
+ variable_parameters=variable_parameters,
api_type=api_type,
checks_for_particular_languages=checks_for_particular_languages,
automate_missing_data_items=automate_missing_data_items,
@@ -537,7 +547,10 @@ def schedule_ocr_prediction_json_population(request):
except KeyError:
automate_missing_data_items = True
- # Calling a function asynchronously to create ocr predictions.
+ filter_string = request.data.get("filter_string")
+ sampling_mode = request.data.get("sampling_mode")
+ sampling_parameters = request.data.get("sampling_parameters_json")
+ variable_parameters = request.data.get("variable_parameters")
uid = request.user.id
@@ -546,6 +559,10 @@ def schedule_ocr_prediction_json_population(request):
user_id=uid,
api_type=api_type,
automate_missing_data_items=automate_missing_data_items,
+ filter_string=filter_string,
+ sampling_mode=sampling_mode,
+ sampling_parameters=sampling_parameters,
+ variable_parameters=variable_parameters,
)
# Returning response
@@ -574,8 +591,20 @@ def schedule_draft_data_json_population(request):
pk = request.data["dataset_instance_id"]
uid = request.user.id
+ filter_string = request.data.get("filter_string")
+ sampling_mode = request.data.get("sampling_mode")
+ sampling_parameters = request.data.get("sampling_parameters_json")
+ variable_parameters = request.data.get("variable_parameters")
- populate_draft_data_json.delay(pk=pk, user_id=uid, fields_list=fields_list)
+ populate_draft_data_json(
+ pk=pk,
+ user_id=uid,
+ fields_list=fields_list,
+ filter_string=filter_string,
+ sampling_mode=sampling_mode,
+ sampling_parameters=sampling_parameters,
+ variable_parameters=variable_parameters,
+ )
ret_dict = {"message": "draft_data_json population started"}
ret_status = status.HTTP_200_OK
@@ -624,7 +653,10 @@ def schedule_asr_prediction_json_population(request):
except KeyError:
automate_missing_data_items = True
- # Calling a function asynchronously to create ocr predictions.
+ filter_string = request.data.get("filter_string")
+ sampling_mode = request.data.get("sampling_mode")
+ sampling_parameters = request.data.get("sampling_parameters_json")
+ variable_parameters = request.data.get("variable_parameters")
uid = request.user.id
@@ -633,6 +665,10 @@ def schedule_asr_prediction_json_population(request):
user_id=uid,
api_type=api_type,
automate_missing_data_items=automate_missing_data_items,
+ filter_string=filter_string,
+ sampling_mode=sampling_mode,
+ sampling_parameters=sampling_parameters,
+ variable_parameters=variable_parameters,
)
ret_dict = {"message": "Generating ASR Predictions"}
@@ -873,3 +909,33 @@ def download_all_projects(request):
},
status=status.HTTP_200_OK,
)
+
+
+@api_view(["POST"])
+def schedule_update_SpeechConversation(request):
+ try:
+ project_id = request.data["project_id"]
+ except Exception as e:
+ return Response(
+ {"message": "Please send a project_id"},
+ status=status.HTTP_400_BAD_REQUEST,
+ )
+ user_id = request.user.id
+ project = Project.objects.filter(id=project_id)
+ try:
+ tg_lang = project[0].tgt_language.lower()
+ model_language = lang_codes[tg_lang]
+ except Exception as e:
+ return Response(
+ {"message": "Error in Fetching language"},
+ status=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ )
+ auto_annotation = request.data.get("auto_annotation", None)
+ update_SpeechConversation.delay(
+ model_language, project_id, auto_annotation, user_id
+ )
+
+ return Response(
+ {"message": "Update In Progress"},
+ status=status.HTTP_200_OK,
+ )
diff --git a/backend/organizations/views.py b/backend/organizations/views.py
index 7d5e7b726..09781b624 100644
--- a/backend/organizations/views.py
+++ b/backend/organizations/views.py
@@ -6,6 +6,7 @@
from rest_framework import status
from tasks.models import (
Task,
+ Statistic,
ANNOTATOR_ANNOTATION,
REVIEWER_ANNOTATION,
SUPER_CHECKER_ANNOTATION,
@@ -52,6 +53,7 @@
send_project_analytics_mail_org,
send_user_analytics_mail_org,
)
+from utils.filter_tasks_by_ann_type import filter_tasks_by_ann_type
def get_task_count(proj_ids, status, annotator, return_count=True):
@@ -460,11 +462,11 @@ def quality_reports(self, request, pk=None):
participation_type = (
"Full Time"
if participation_type == 1
- else "Part Time"
- if participation_type == 2
- else "Contract Basis"
- if participation_type == 4
- else "N/A"
+ else (
+ "Part Time"
+ if participation_type == 2
+ else "Contract Basis" if participation_type == 4 else "N/A"
+ )
)
role = get_role_name(annotator.role)
user_id = annotator.id
@@ -778,11 +780,11 @@ def user_analytics(self, request, pk=None):
participation_type = (
"Full Time"
if participation_type == 1
- else "Part Time"
- if participation_type == 2
- else "Contract Basis"
- if participation_type == 4
- else "N/A"
+ else (
+ "Part Time"
+ if participation_type == 2
+ else "Contract Basis" if participation_type == 4 else "N/A"
+ )
)
role = get_role_name(annotator.role)
user_id = annotator.id
@@ -934,9 +936,9 @@ def iter_items(items, pseudo_buffer):
status=status.HTTP_200_OK,
content_type="text/csv",
)
- response[
- "Content-Disposition"
- ] = f'attachment; filename="{organization.title}_user_analytics.csv"'
+ response["Content-Disposition"] = (
+ f'attachment; filename="{organization.title}_user_analytics.csv"'
+ )
return response
return Response(data=final_result, status=status.HTTP_200_OK)
@@ -2713,6 +2715,7 @@ def cumulative_tasks_count(self, request, pk=None):
"AudioSegmentation",
"AudioTranscription",
"AudioTranscriptionEditing",
+ "StandardisedTranscriptionEditing",
"ContextualSentenceVerification",
"ContextualSentenceVerificationAndDomainClassification",
"ContextualTranslationEditing",
@@ -2742,24 +2745,41 @@ def cumulative_tasks_count(self, request, pk=None):
other_lang = []
for lang in languages:
proj_lang_filter = proj_objs.filter(tgt_language=lang)
- annotation_tasks_count = 0
- reviewer_task_count = 0
+ annotation_tasks = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ task_status__in=[
+ "annotated",
+ "reviewed",
+ "super_checked",
+ ],
+ )
reviewer_tasks = Task.objects.filter(
project_id__in=proj_lang_filter,
project_id__project_stage__in=[REVIEW_STAGE, SUPERCHECK_STAGE],
- task_status__in=["reviewed", "exported", "super_checked"],
+ task_status__in=["reviewed", "super_checked"],
)
-
- annotation_tasks = Task.objects.filter(
+ supercheck_tasks = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[SUPERCHECK_STAGE],
+ task_status__in=["super_checked"],
+ )
+ annotation_tasks_exported = Task.objects.filter(
project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[ANNOTATION_STAGE],
task_status__in=[
- "annotated",
- "reviewed",
"exported",
- "super_checked",
],
)
-
+ reviewer_tasks_exported = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[REVIEW_STAGE],
+ task_status__in=["exported"],
+ )
+ supercheck_tasks_exported = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[SUPERCHECK_STAGE],
+ task_status__in=["exported"],
+ )
if metainfo == True:
result = {}
@@ -2974,15 +2994,30 @@ def cumulative_tasks_count(self, request, pk=None):
}
else:
- reviewer_task_count = reviewer_tasks.count()
-
- annotation_tasks_count = annotation_tasks.count()
-
- result = {
- "language": lang,
- "ann_cumulative_tasks_count": annotation_tasks_count,
- "rew_cumulative_tasks_count": reviewer_task_count,
- }
+ # reviewer_task_count = (
+ # reviewer_tasks.count()
+ # + reviewer_tasks_exported.count()
+ # + supercheck_tasks_exported.count()
+ # )
+
+ # annotation_tasks_count = (
+ # annotation_tasks.count()
+ # + annotation_tasks_exported.count()
+ # + reviewer_tasks_exported.count()
+ # + supercheck_tasks_exported.count()
+ # )
+
+ # supercheck_tasks_count = (
+ # supercheck_tasks.count() + supercheck_tasks_exported.count()
+ # )
+
+ # result = {
+ # "language": lang,
+ # "ann_cumulative_tasks_count": annotation_tasks_count,
+ # "rew_cumulative_tasks_count": reviewer_task_count,
+ # "sup_cumulative_tasks_count": supercheck_tasks_count,
+ # }
+ result = {}
if lang == None or lang == "":
other_lang.append(result)
@@ -2991,6 +3026,7 @@ def cumulative_tasks_count(self, request, pk=None):
ann_task_count = 0
rew_task_count = 0
+ sup_task_count = 0
ann_word_count = 0
rew_word_count = 0
ann_aud_dur = 0
@@ -3003,8 +3039,10 @@ def cumulative_tasks_count(self, request, pk=None):
rev_sentance_count = 0
for dat in other_lang:
if metainfo != True:
- ann_task_count += dat["ann_cumulative_tasks_count"]
- rew_task_count += dat["rew_cumulative_tasks_count"]
+ # ann_task_count += dat["ann_cumulative_tasks_count"]
+ # rew_task_count += dat["rew_cumulative_tasks_count"]
+ # sup_task_count += dat["sup_cumulative_tasks_count"]
+ pass
else:
if project_type in get_audio_project_types():
ann_aud_dur += convert_hours_to_seconds(
@@ -3043,11 +3081,13 @@ def cumulative_tasks_count(self, request, pk=None):
if len(other_lang) > 0:
if metainfo != True:
- other_language = {
- "language": "Others",
- "ann_cumulative_tasks_count": ann_task_count,
- "rew_cumulative_tasks_count": rew_task_count,
- }
+ # other_language = {
+ # "language": "Others",
+ # "ann_cumulative_tasks_count": ann_task_count,
+ # "rew_cumulative_tasks_count": rew_task_count,
+ # "sup_cumulative_tasks_count": sup_task_count,
+ # }
+ other_language = {}
else:
if project_type in get_audio_project_types():
other_language = {
@@ -3119,4 +3159,12 @@ def cumulative_tasks_count(self, request, pk=None):
pass
else:
final_result_for_all_types[project_type] = final_result
+ if metainfo != True:
+
+ task_counts = list(
+ Statistic.objects.filter(stat_type="task_count", org_id=organization.id)
+ )[0].result
+
+ for pjt_type in project_types:
+ final_result_for_all_types[pjt_type] = task_counts[pjt_type]
return Response(final_result_for_all_types)
diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py
index 2dddab5d3..3629c8b28 100644
--- a/backend/projects/annotation_registry.py
+++ b/backend/projects/annotation_registry.py
@@ -108,6 +108,15 @@
"type": "labels",
},
},
+ "OCRTextlineSegmentation": {
+ "ocr_transcribed_json": {
+ "to_name": "image_url",
+ "from_name": [
+ "annotation_bboxes",
+ ],
+ "type": ["rectangle"],
+ },
+ },
"OCRTranscription": {
"ocr_transcribed_json": {
"to_name": "image_url",
@@ -167,59 +176,6 @@
}
-def convert_prediction_json_to_annotation_result(
- prediction_json, speakers_json, audio_duration, index, is_acoustic=False
-):
- """
- Convert prediction_json and transcribed_json to annotation_result
- """
-
- result = []
- if prediction_json == None:
- return result
-
- for idx, val in enumerate(prediction_json):
- label_dict = {
- "origin": "manual",
- "to_name": "audio_url",
- "from_name": "labels",
- "original_length": audio_duration,
- }
- text_dict = {
- "origin": "manual",
- "to_name": "audio_url",
- "from_name": "transcribed_json",
- "original_length": audio_duration,
- }
- if is_acoustic:
- text_dict["from_name"] = "verbatim_transcribed_json"
- id = f"shoonya_{index}s{idx}s{generate_random_string(13-len(str(idx)))}"
- label_dict["id"] = id
- text_dict["id"] = id
- label_dict["type"] = "labels"
- text_dict["type"] = "textarea"
-
- value_labels = {
- "start": val["start"],
- "end": val["end"],
- "labels": [
- next(
- speaker
- for speaker in speakers_json
- if speaker["speaker_id"] == val["speaker_id"]
- )["name"]
- ],
- }
- value_text = {"start": val["start"], "end": val["end"], "text": [val["text"]]}
-
- label_dict["value"] = value_labels
- text_dict["value"] = value_text
- result.append(label_dict)
- result.append(text_dict)
-
- return result
-
-
def convert_conversation_json_to_annotation_result(conversation_json, idx):
result = []
for i in range(len(conversation_json)):
@@ -239,12 +195,15 @@ def convert_conversation_json_to_annotation_result(conversation_json, idx):
def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None):
+ from projects.views import convert_prediction_json_to_annotation_result
+
registry_helper = ProjectRegistry.get_instance()
input_dataset_info = registry_helper.get_input_dataset_and_fields(project_type)
dataset_model = getattr(dataset_models, input_dataset_info["dataset_type"])
try:
dataset_item = dataset_model.objects.get(pk=pk)
except:
+ dataset_item = None
pass
result = []
idx = 0
@@ -263,13 +222,20 @@ def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None)
if field == "conversation_json":
ans = convert_conversation_json_to_annotation_result(value, idx)
elif field == "transcribed_json" or field == "prediction_json":
- ans = convert_prediction_json_to_annotation_result(
+ assert type(value) in [
+ list,
+ dict,
+ ], f"Something wrong is there in the type of {value}"
+ if isinstance(value, list):
+ value = {"verbatim_transcribed_json": value}
+ sub_ans = convert_prediction_json_to_annotation_result(
+ None,
+ project_type,
+ dataset_item,
value,
- dataset_item.speakers_json,
- dataset_item.audio_duration,
- idx,
- project_type == "AcousticNormalisedTranscriptionEditing",
+ True,
)
+ ans.extend(sub_ans)
else:
if field_type == "textarea":
field_dict["value"] = {"text": [value]}
diff --git a/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx b/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx
new file mode 100644
index 000000000..7684b037b
--- /dev/null
+++ b/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/backend/projects/project_registry.yaml b/backend/projects/project_registry.yaml
index b4c0c8d85..3c1e4f5eb 100644
--- a/backend/projects/project_registry.yaml
+++ b/backend/projects/project_registry.yaml
@@ -99,6 +99,24 @@ OCR:
fields:
annotations:
- ocr_transcribed_json
+ OCRTextlineSegmentation:
+ project_mode: "Annotation"
+ label_studio_jsx_file: "ocr/ocr_textline_segmentation.jsx"
+ input_dataset:
+ class: OCRDocument
+ fields:
+ - image_url
+ - page_number
+ display_fields:
+ - image_url
+ - page_number
+ prediction: ocr_prediction_json
+ output_dataset:
+ class: OCRDocument
+ save_type: in_place
+ fields:
+ annotations:
+ - ocr_transcribed_json
OCRTranscriptionEditing:
project_mode: "Annotation"
label_studio_jsx_file: "ocr/ocr_transcription.jsx"
diff --git a/backend/projects/registry_helper.py b/backend/projects/registry_helper.py
index ed1859e4c..3f8a5653a 100644
--- a/backend/projects/registry_helper.py
+++ b/backend/projects/registry_helper.py
@@ -253,3 +253,15 @@ def validate_registry(self):
)
return True
+
+ def get_project_name_from_dataset(self, dataset_name: str):
+ for project_key, project_type in self.project_types.items():
+ input_dataset = project_type.get("input_dataset", {})
+ output_dataset = project_type.get("output_dataset", {})
+
+ if (
+ input_dataset.get("class") == dataset_name
+ or output_dataset.get("class") == dataset_name
+ ):
+ return project_key
+ return None
diff --git a/backend/projects/utils.py b/backend/projects/utils.py
index 4987ed878..c2b877007 100644
--- a/backend/projects/utils.py
+++ b/backend/projects/utils.py
@@ -27,7 +27,7 @@
from jiwer import wer
from utils.convert_result_to_chitralekha_format import create_memory
-
+from dataset import models as dataset_models
nltk.download("punkt")
@@ -361,7 +361,10 @@ def process_speech_tasks(task, is_audio_segmentation, project_type):
def process_ocr_tasks(
- task, is_OCRSegmentCategorization, is_OCRSegmentCategorizationEditing
+ task,
+ is_OCRSegmentCategorization,
+ is_OCRSegmentCategorizationEditing,
+ is_OCRTextlineSegmentation,
):
annotation_result = process_annotation_result(task)
process_ocr_results(
@@ -369,6 +372,7 @@ def process_ocr_tasks(
annotation_result,
is_OCRSegmentCategorization,
is_OCRSegmentCategorizationEditing,
+ is_OCRTextlineSegmentation,
)
@@ -451,6 +455,7 @@ def process_ocr_results(
annotation_result,
is_OCRSegmentCategorization,
is_OCRSegmentCategorizationEditing,
+ is_OCRTextlineSegmentation,
):
from projects.views import convert_annotation_result_to_formatted_json
@@ -458,10 +463,16 @@ def process_ocr_results(
annotation_result,
None,
False,
- is_OCRSegmentCategorization or is_OCRSegmentCategorizationEditing,
+ is_OCRSegmentCategorization
+ or is_OCRSegmentCategorizationEditing
+ or is_OCRTextlineSegmentation,
False,
)
- if is_OCRSegmentCategorization or is_OCRSegmentCategorizationEditing:
+ if (
+ is_OCRSegmentCategorization
+ or is_OCRSegmentCategorizationEditing
+ or is_OCRTextlineSegmentation
+ ):
bboxes_relation_json = []
for ann in annotation_result:
if "type" in ann and ann["type"] == "relation":
@@ -485,6 +496,7 @@ def process_task(
include_input_data_metadata_json,
dataset_model,
is_audio_project_type,
+ fetch_parent_data_field,
):
task_dict = model_to_dict(task)
if export_type != "JSON":
@@ -519,6 +531,21 @@ def process_task(
task_dict["data"]["input_data_metadata_json"] = dataset_model.objects.get(
pk=task_dict["input_data"]
).metadata_json
+ try:
+ if fetch_parent_data_field and dataset_model:
+ parent_data_item = dataset_model.objects.get(
+ pk=task_dict["input_data"]
+ ).parent_data
+ if parent_data_item:
+ dataset_model = getattr(
+ dataset_models, parent_data_item.instance_id.dataset_type
+ )
+ parent_dataset_model = dataset_model.objects.get(pk=parent_data_item.id)
+ task_dict["data"]["fetch_parent_data_field"] = getattr(
+ parent_dataset_model, fetch_parent_data_field, None
+ )
+ except Exception as e:
+ pass
del task_dict["annotation_users"]
del task_dict["review_user"]
diff --git a/backend/projects/views.py b/backend/projects/views.py
index 68e48c472..7bf98ede3 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -54,6 +54,7 @@
from .models import *
from .registry_helper import ProjectRegistry
from dataset import models as dataset_models
+from django.db.models import Exists, OuterRef
from dataset.models import (
DatasetInstance,
@@ -402,6 +403,7 @@ def get_review_reports(proj_id, userid, start_date, end_date):
"OCRTranscription",
"OCRSegmentCategorization",
"OCRSegmentCategorizationEditing",
+ "OCRTextlineSegmentation",
]:
result["Total Word Count"] = total_word_count
elif proj_type in get_audio_project_types():
@@ -650,15 +652,16 @@ def get_supercheck_reports(proj_id, userid, start_date, end_date):
"OCRTranscription",
"OCRSegmentCategorization",
"OCRSegmentCategorizationEditing",
+ "OCRTextlineSegmentation",
]:
result["Validated Word Count"] = validated_word_count
result["Validated With Changes Word Count"] = validated_with_changes_word_count
result["Rejected Word Count"] = rejected_word_count
elif proj_type in get_audio_project_types():
result["Validated Segments Duration"] = validated_audio_duration
- result[
- "Validated With Changes Segments Duration"
- ] = validated_with_changes_audio_duration
+ result["Validated With Changes Segments Duration"] = (
+ validated_with_changes_audio_duration
+ )
result["Rejected Segments Duration"] = rejected_audio_duration
result["Total Raw Audio Duration"] = total_raw_audio_duration
result["Average Word Error Rate R/S"] = round(avg_word_error_rate, 2)
@@ -854,69 +857,152 @@ def get_task_count_unassigned(pk, user):
return len(proj_tasks_unassigned)
-def convert_prediction_json_to_annotation_result(pk, proj_type):
+def convert_prediction_json_to_annotation_result(
+ pk, proj_type, data_item, prediction_json, populate_draft_data=False
+):
result = []
if (
proj_type == "AudioTranscriptionEditing"
or proj_type == "AcousticNormalisedTranscriptionEditing"
):
- data_item = SpeechConversation.objects.get(pk=pk)
- prediction_json = (
- json.loads(data_item.prediction_json)
- if isinstance(data_item.prediction_json, str)
- else data_item.prediction_json
- )
+ if not data_item and not prediction_json:
+ data_item = SpeechConversation.objects.get(pk=pk)
+ prediction_json = (
+ json.loads(data_item.prediction_json)
+ if isinstance(data_item.prediction_json, str)
+ else data_item.prediction_json
+ )
+ assert type(prediction_json) in [
+ dict,
+ list,
+ ], "Seems something is wrong with the formatting"
+ # see if the prediction is a list, then it seems that only verbatim json is present
+ if isinstance(prediction_json, list):
+ prediction_json = {"verbatim_transcribed_json": prediction_json}
+
speakers_json = data_item.speakers_json
audio_duration = data_item.audio_duration
# converting prediction_json to result (wherever it exists) for every task.
if prediction_json == None:
return result
- for idx, val in enumerate(prediction_json):
- label_dict = {
- "origin": "manual",
- "to_name": "audio_url",
- "from_name": "labels",
- "original_length": audio_duration,
- }
- text_dict = {
- "origin": "manual",
- "to_name": "audio_url",
- "from_name": "transcribed_json",
- "original_length": audio_duration,
- }
- if proj_type == "AcousticNormalisedTranscriptionEditing":
- text_dict["from_name"] = "verbatim_transcribed_json"
- id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
- label_dict["id"] = id
- text_dict["id"] = id
- label_dict["type"] = "labels"
- text_dict["type"] = "textarea"
-
- value_labels = {
- "start": val["start"],
- "end": val["end"],
- "labels": [
- next(
- speaker
- for speaker in speakers_json
- if speaker["speaker_id"] == val["speaker_id"]
- )["name"]
- ],
- }
- value_text = {
- "start": val["start"],
- "end": val["end"],
- "text": [val["text"]],
- }
+ # for pred_type, pred_json in prediction_json.items():
+ if "acoustic_normalised_transcribed_json" in prediction_json.keys():
+ for idx, (val, val_acoustic) in enumerate(
+ zip(
+ prediction_json["verbatim_transcribed_json"],
+ prediction_json["acoustic_normalised_transcribed_json"],
+ )
+ ):
+ label_dict = {
+ "origin": "manual",
+ "to_name": "audio_url",
+ "from_name": "labels",
+ "original_length": audio_duration,
+ }
+ text_dict = {
+ "origin": "manual",
+ "to_name": "audio_url",
+ "from_name": "transcribed_json",
+ "original_length": audio_duration,
+ }
+ text_dict_acoustic = {
+ "origin": "manual",
+ "to_name": "audio_url",
+ "from_name": "transcribed_json",
+ "original_length": audio_duration,
+ }
+ if proj_type == "AcousticNormalisedTranscriptionEditing":
+ text_dict["from_name"] = "verbatim_transcribed_json"
+ text_dict_acoustic["from_name"] = (
+ "acoustic_normalised_transcribed_json"
+ )
- label_dict["value"] = value_labels
- text_dict["value"] = value_text
- # mainly label_dict and text_dict are sent as result
- result.append(label_dict)
- result.append(text_dict)
+ id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
+ label_dict["id"] = id
+ text_dict["id"] = id
+ text_dict_acoustic["id"] = id
+
+ label_dict["type"] = "labels"
+ text_dict["type"] = "textarea"
+ text_dict_acoustic["type"] = "textarea"
+
+ value_labels = {
+ "start": val["start"],
+ "end": val["end"],
+ "labels": [
+ next(
+ speaker
+ for speaker in speakers_json
+ if speaker["speaker_id"] == val["speaker_id"]
+ )["name"]
+ ],
+ }
+ value_text = {
+ "start": val["start"],
+ "end": val["end"],
+ "text": [val["text"]],
+ }
+ value_text_acoustic = {
+ "start": val_acoustic["start"],
+ "end": val_acoustic["end"],
+ "text": [val_acoustic["text"]],
+ }
+
+ label_dict["value"] = value_labels
+ text_dict["value"] = value_text
+ text_dict_acoustic["value"] = value_text_acoustic
+ # mainly label_dict and text_dict are sent as result
+ result.append(label_dict)
+ result.append(text_dict)
+ result.append(text_dict_acoustic)
+ else:
+ for idx, val in enumerate(prediction_json["verbatim_transcribed_json"]):
+ label_dict = {
+ "origin": "manual",
+ "to_name": "audio_url",
+ "from_name": "labels",
+ "original_length": audio_duration,
+ }
+ text_dict = {
+ "origin": "manual",
+ "to_name": "audio_url",
+ "from_name": "transcribed_json",
+ "original_length": audio_duration,
+ }
+ if proj_type == "AcousticNormalisedTranscriptionEditing":
+ text_dict["from_name"] = "verbatim_transcribed_json"
+ id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
+ label_dict["id"] = id
+ text_dict["id"] = id
+ label_dict["type"] = "labels"
+ text_dict["type"] = "textarea"
+
+ value_labels = {
+ "start": val["start"],
+ "end": val["end"],
+ "labels": [
+ next(
+ speaker
+ for speaker in speakers_json
+ if speaker["speaker_id"] == val["speaker_id"]
+ )["name"]
+ ],
+ }
+ value_text = {
+ "start": val["start"],
+ "end": val["end"],
+ "text": [val["text"]],
+ }
+
+ label_dict["value"] = value_labels
+ text_dict["value"] = value_text
+ # mainly label_dict and text_dict are sent as result
+ result.append(label_dict)
+ result.append(text_dict)
elif (
proj_type == "OCRTranscriptionEditing"
or proj_type == "OCRSegmentCategorizationEditing"
+ or proj_type == "OCRTextlineSegmentation"
):
data_item = OCRDocument.objects.get(pk=pk)
ocr_prediction_json = (
@@ -993,7 +1079,7 @@ def convert_annotation_result_to_formatted_json(
annotation_result,
speakers_json,
is_SpeechConversation,
- is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing,
+ is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation,
is_acoustic=False,
):
transcribed_json = []
@@ -1089,14 +1175,18 @@ def convert_annotation_result_to_formatted_json(
acoustic_transcribed_json, ensure_ascii=False
)
else:
- dicts = 2 if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing else 3
+ dicts = (
+ 2
+ if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation
+ else 3
+ )
for idx1 in range(0, len(annotation_result), dicts):
rectangle_dict = {}
labels_dict = {}
text_dict = {}
if isinstance(annotation_result[idx1], str):
annotation_result[idx1] = json.loads(annotation_result[idx1])
- if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing:
+ if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation:
custom_text_dict = {"value": {"text": ""}}
text_dict = json.dumps(custom_text_dict, indent=2)
for idx2 in range(idx1, idx1 + dicts):
@@ -2130,9 +2220,9 @@ def create(self, request, *args, **kwargs):
if automatic_annotation_creation_mode != None:
if proj.metadata_json == None:
proj.metadata_json = {}
- proj.metadata_json[
- "automatic_annotation_creation_mode"
- ] = automatic_annotation_creation_mode
+ proj.metadata_json["automatic_annotation_creation_mode"] = (
+ automatic_annotation_creation_mode
+ )
if proj.project_type == "AcousticNormalisedTranscriptionEditing":
if proj.metadata_json == None:
proj.metadata_json = {}
@@ -2302,13 +2392,29 @@ def assign_new_tasks(self, request, pk, *args, **kwargs):
annotation_status__exact=UNLABELED, completed_by=cur_user
)
annotation_tasks = [anno.task.id for anno in proj_annotations]
- pending_tasks = (
- Task.objects.filter(project_id=pk)
- .filter(annotation_users=cur_user.id)
- .filter(task_status__in=[INCOMPLETE, UNLABELED])
- .filter(id__in=annotation_tasks)
- .count()
- )
+ if project.project_type in get_audio_project_types():
+ pending_tasks = (
+ Task.objects.filter(project_id=pk)
+ .filter(annotation_users=cur_user.id)
+ .filter(task_status__in=[INCOMPLETE, UNLABELED])
+ .filter(id__in=annotation_tasks)
+ .filter(
+ Exists(
+ SpeechConversation.objects.filter(
+ id=OuterRef("input_data_id"), freeze_task=False
+ )
+ )
+ )
+ .count()
+ )
+ else:
+ pending_tasks = (
+ Task.objects.filter(project_id=pk)
+ .filter(annotation_users=cur_user.id)
+ .filter(task_status__in=[INCOMPLETE, UNLABELED])
+ .filter(id__in=annotation_tasks)
+ .count()
+ )
# assigned_tasks_queryset = Task.objects.filter(project_id=pk).filter(annotation_users=cur_user.id)
# assigned_tasks = assigned_tasks_queryset.count()
# completed_tasks = Annotation_model.objects.filter(task__in=assigned_tasks_queryset).filter(completed_by__exact=cur_user.id).count()
@@ -2345,6 +2451,15 @@ def assign_new_tasks(self, request, pk, *args, **kwargs):
.exclude(annotation_users=cur_user.id)
.annotate(annotator_count=Count("annotation_users"))
)
+ if project.project_type in get_audio_project_types():
+ tasks = tasks.filter(
+ Exists(
+ SpeechConversation.objects.filter(
+ id=OuterRef("input_data_id"), freeze_task=False
+ )
+ )
+ )
+
tasks = tasks.filter(
annotator_count__lt=project.required_annotators_per_task
).distinct()
@@ -2365,10 +2480,11 @@ def assign_new_tasks(self, request, pk, *args, **kwargs):
"AudioTranscriptionEditing",
"OCRTranscriptionEditing",
"OCRSegmentCategorizationEditing",
+ "OCRTextlineSegmentation",
]:
try:
result = convert_prediction_json_to_annotation_result(
- task.input_data.id, project.project_type
+ task.input_data.id, project.project_type, None, None, False
)
except Exception as e:
print(
@@ -2611,6 +2727,7 @@ def assign_new_review_tasks(self, request, pk, *args, **kwargs):
except Exception as e:
continue
# check if the project contains eligible tasks to pull
+
tasks = (
Task.objects.filter(project_id=pk)
.filter(task_status=ANNOTATED)
@@ -2618,6 +2735,16 @@ def assign_new_review_tasks(self, request, pk, *args, **kwargs):
.exclude(annotation_users=cur_user.id)
.distinct()
)
+
+ if project.project_type in get_audio_project_types():
+ tasks = tasks.filter(
+ Exists(
+ SpeechConversation.objects.filter(
+ id=OuterRef("input_data_id"), freeze_task=False
+ )
+ )
+ )
+
if not tasks:
project.release_lock(REVIEW_LOCK)
return Response(
@@ -2820,6 +2947,16 @@ def assign_new_supercheck_tasks(self, request, pk, *args, **kwargs):
.exclude(review_user=cur_user.id)
.distinct()
)
+
+ if project.project_type in get_audio_project_types():
+ tasks = tasks.filter(
+ Exists(
+ SpeechConversation.objects.filter(
+ id=OuterRef("input_data_id"), freeze_task=False
+ )
+ )
+ )
+
if not tasks:
project.release_lock(SUPERCHECK_LOCK)
return Response(
@@ -4048,7 +4185,9 @@ def download(self, request, pk=None, *args, **kwargs):
try:
project = Project.objects.get(pk=pk)
project_type = dict(PROJECT_TYPE_CHOICES)[project.project_type]
-
+ fetch_parent_data_field = request.query_params.get(
+ "fetch_parent_data_field", None
+ )
include_input_data_metadata_json = request.query_params.get(
"include_input_data_metadata_json", False
)
@@ -4090,6 +4229,7 @@ def download(self, request, pk=None, *args, **kwargs):
is_OCRSegmentCategorizationEditing = (
project_type == "OCRSegmentCategorizationEditing"
)
+ is_OCRTextlineSegmentation = project_type == "OCRTextlineSegmentation"
is_OCRSegmentCategorization = project_type == "OCRSegmentCategorization"
for task in tasks:
try:
@@ -4099,6 +4239,7 @@ def download(self, request, pk=None, *args, **kwargs):
include_input_data_metadata_json,
dataset_model,
is_audio_project_type,
+ fetch_parent_data_field,
)
if (
is_ConversationTranslation
@@ -4121,6 +4262,7 @@ def download(self, request, pk=None, *args, **kwargs):
curr_task,
is_OCRSegmentCategorization,
is_OCRSegmentCategorizationEditing,
+ is_OCRTextlineSegmentation,
)
except Exception as e:
continue
diff --git a/backend/shoonya_backend/celery.py b/backend/shoonya_backend/celery.py
index b75b761e3..ed741f516 100644
--- a/backend/shoonya_backend/celery.py
+++ b/backend/shoonya_backend/celery.py
@@ -1,6 +1,7 @@
from __future__ import absolute_import, unicode_literals
from datetime import timedelta
from celery.schedules import crontab
+from celery.signals import worker_ready
import os
from celery import Celery
@@ -38,8 +39,24 @@
"task": "check_size",
"schedule": crontab(minute=0, hour=0), # every mid night
},
+ "fetchTaskCounts": {
+ "task": "fetchTaskCounts",
+ "schedule": crontab(minute=0, hour="*/1"),
+ },
+ "fetchWorkspaceTaskCounts": {
+ "task": "fetchWorkspaceTaskCounts",
+ "schedule": crontab(minute=0, hour="*/1"),
+ },
}
+
+@worker_ready.connect
+def at_start(sender, **k):
+ with sender.app.connection() as conn:
+ sender.app.send_task("fetchTaskCounts", connection=conn)
+ sender.app.send_task("fetchWorkspaceTaskCounts", connection=conn)
+
+
# Celery Task related settings
celery_app.autodiscover_tasks()
diff --git a/backend/shoonya_backend/settings.py b/backend/shoonya_backend/settings.py
index 2915ce894..281194cb8 100644
--- a/backend/shoonya_backend/settings.py
+++ b/backend/shoonya_backend/settings.py
@@ -42,6 +42,7 @@
"0.0.0.0",
"backend.shoonya.ai4bharat.org",
"backend.shoonya2.ai4bharat.org",
+ "127.0.0.1"
]
# Application definition
diff --git a/backend/tasks/models.py b/backend/tasks/models.py
index 7fac09c7a..75f8639dd 100644
--- a/backend/tasks/models.py
+++ b/backend/tasks/models.py
@@ -271,6 +271,16 @@ class Meta:
)
+class Statistic(models.Model):
+ stat_type = models.CharField(max_length=255)
+ org_id = models.IntegerField()
+ result = models.JSONField()
+ updated_at = models.DateTimeField(auto_now=True)
+
+ class Meta:
+ unique_together = ("stat_type", "org_id")
+
+
class Prediction(models.Model):
"""ML predictions"""
diff --git a/backend/tasks/urls.py b/backend/tasks/urls.py
index d4f229e6f..1df41e6cf 100644
--- a/backend/tasks/urls.py
+++ b/backend/tasks/urls.py
@@ -6,6 +6,7 @@
AnnotationViewSet,
PredictionViewSet,
get_celery_tasks,
+ TransliterationAPIView,
)
router = routers.DefaultRouter()
@@ -15,4 +16,9 @@
urlpatterns = [
path("get_celery_tasks/", get_celery_tasks),
+ path(
+ "xlit-api/generic/transliteration/<str:target_language>/<str:data>/",
+ TransliterationAPIView.as_view(),
+ name="transliteration-api",
+ ),
] + router.urls
diff --git a/backend/tasks/views.py b/backend/tasks/views.py
index 43ebf9339..0e889d6f1 100644
--- a/backend/tasks/views.py
+++ b/backend/tasks/views.py
@@ -53,6 +53,7 @@
import sacrebleu
from utils.date_time_conversions import utc_to_ist
+from rest_framework.views import APIView
# Create your views here.
@@ -1753,13 +1754,17 @@ def partial_update(self, request, pk=None):
== "AcousticNormalisedTranscriptionEditing"
else False
)
- is_ocr_sc_or_sce = (
+ is_ocr_sc_or_sce_or_ts = (
True
if annotation_obj.task.project_id.project_type
- in ["OCRSegmentCategorization", "OCRSegmentCategorizationEditing"]
+ in [
+ "OCRSegmentCategorization",
+ "OCRSegmentCategorizationEditing",
+ "OCRTextlineSegmentation",
+ ]
else False
)
- if is_ocr_sc_or_sce and (
+ if is_ocr_sc_or_sce_or_ts and (
"language" in request.data or "ocr_domain" in request.data
):
language = request.data.get("languages", [])
@@ -1785,11 +1790,15 @@ def partial_update(self, request, pk=None):
request.data["result"],
annotation_obj.task,
is_acoustic_project_type,
- is_acoustic_project_type
- and annotation_obj.task.project_id.metadata_json[
- "acoustic_enabled_stage"
- ]
- == 1,
+ (
+ is_acoustic_project_type
+ and "acoustic_enabled_stage"
+ in annotation_obj.task.project_id.metadata_json
+ and annotation_obj.task.project_id.metadata_json[
+ "acoustic_enabled_stage"
+ ]
+ == 1
+ ),
)
else:
annotation_obj.result = request.data["result"]
@@ -1841,11 +1850,15 @@ def partial_update(self, request, pk=None):
request.data["result"],
annotation_obj.task,
is_acoustic_project_type,
- is_acoustic_project_type
- and annotation_obj.task.project_id.metadata_json[
- "acoustic_enabled_stage"
- ]
- == 1,
+ (
+ is_acoustic_project_type
+ and "acoustic_enabled_stage"
+ in annotation_obj.task.project_id.metadata_json
+ and annotation_obj.task.project_id.metadata_json[
+ "acoustic_enabled_stage"
+ ]
+ == 1
+ ),
)
annotation_status = request.data["annotation_status"]
if empty_flag == True and annotation_status in [
@@ -1914,11 +1927,15 @@ def partial_update(self, request, pk=None):
request.data["result"],
annotation_obj.task,
is_acoustic_project_type,
- is_acoustic_project_type
- and annotation_obj.task.project_id.metadata_json[
- "acoustic_enabled_stage"
- ]
- <= 2,
+ (
+ is_acoustic_project_type
+ and "acoustic_enabled_stage"
+ in annotation_obj.task.project_id.metadata_json
+ and annotation_obj.task.project_id.metadata_json[
+ "acoustic_enabled_stage"
+ ]
+ <= 2
+ ),
)
else:
annotation_obj.result = request.data["result"]
@@ -2009,11 +2026,15 @@ def partial_update(self, request, pk=None):
request.data["result"],
annotation_obj.task,
is_acoustic_project_type,
- is_acoustic_project_type
- and annotation_obj.task.project_id.metadata_json[
- "acoustic_enabled_stage"
- ]
- <= 2,
+ (
+ is_acoustic_project_type
+ and "acoustic_enabled_stage"
+ in annotation_obj.task.project_id.metadata_json
+ and annotation_obj.task.project_id.metadata_json[
+ "acoustic_enabled_stage"
+ ]
+ <= 2
+ ),
)
annotation_status = request.data["annotation_status"]
if empty_flag == True and annotation_status in [
@@ -2109,11 +2130,15 @@ def partial_update(self, request, pk=None):
request.data["result"],
annotation_obj.task,
is_acoustic_project_type,
- is_acoustic_project_type
- and annotation_obj.task.project_id.metadata_json[
- "acoustic_enabled_stage"
- ]
- <= 3,
+ (
+ is_acoustic_project_type
+ and "acoustic_enabled_stage"
+ in annotation_obj.task.project_id.metadata_json
+ and annotation_obj.task.project_id.metadata_json[
+ "acoustic_enabled_stage"
+ ]
+ <= 3
+ ),
)
else:
annotation_obj.result = request.data["result"]
@@ -2195,11 +2220,15 @@ def partial_update(self, request, pk=None):
request.data["result"],
annotation_obj.task,
is_acoustic_project_type,
- is_acoustic_project_type
- and annotation_obj.task.project_id.metadata_json[
- "acoustic_enabled_stage"
- ]
- <= 3,
+ (
+ is_acoustic_project_type
+ and "acoustic_enabled_stage"
+ in annotation_obj.task.project_id.metadata_json
+ and annotation_obj.task.project_id.metadata_json[
+ "acoustic_enabled_stage"
+ ]
+ <= 3
+ ),
)
if empty_flag == True and annotation_status in [
LABELED,
@@ -2642,3 +2671,16 @@ def get_celery_tasks(request):
page_size = int(request.GET.get("page_size", 10))
data = paginate_queryset(filtered_tasks, page_number, page_size)
return JsonResponse(data["results"], safe=False)
+
+
+class TransliterationAPIView(APIView):
+ permission_classes = [IsAuthenticated]
+
+ def get(self, request, target_language, data, *args, **kwargs):
+ response_transliteration = requests.get(
+ os.getenv("TRANSLITERATION_URL") + target_language + "/" + data,
+ headers={"Authorization": "Bearer " + os.getenv("TRANSLITERATION_KEY")},
+ )
+
+ transliteration_output = response_transliteration.json()
+ return Response(transliteration_output, status=status.HTTP_200_OK)
diff --git a/backend/user_reports.py b/backend/user_reports.py
index 41ef1f7a0..72ece4961 100644
--- a/backend/user_reports.py
+++ b/backend/user_reports.py
@@ -18,6 +18,32 @@
from django.conf import settings
from pretty_html_table import build_table
import numpy as np
+from tasks.models import Statistic
+from django.db import connection
+
+
+def checkNoneValue(value):
+ if value == None:
+ return "0.0"
+ return value
+
+
+def checkLangNone(language):
+ if language == None:
+ return "Others"
+ return language
+
+
+def upsert_stat(stat_type, org_id, result):
+ obj, created = Statistic.objects.update_or_create(
+ stat_type=stat_type,
+ org_id=org_id,
+ defaults={
+ "result": result,
+ },
+ )
+
+ return obj, created
def calculate_reports():
@@ -330,3 +356,553 @@ def calculate_reports():
[user.email],
html_message=email_to_send,
)
+
+
+def fetch_task_counts():
+ org_ids = [1, 2, 3]
+ project_types = [
+ "AcousticNormalisedTranscriptionEditing",
+ "AudioSegmentation",
+ "AudioTranscription",
+ "AudioTranscriptionEditing",
+ "StandardisedTranscriptionEditing",
+ "ContextualSentenceVerification",
+ "ContextualSentenceVerificationAndDomainClassification",
+ "ContextualTranslationEditing",
+ "ConversationTranslation",
+ "ConversationTranslationEditing",
+ "ConversationVerification",
+ "MonolingualTranslation",
+ "OCRTranscriptionEditing",
+ "SemanticTextualSimilarity_Scale5",
+ "SentenceSplitting",
+ "TranslationEditing",
+ "OCRSegmentCategorizationEditing",
+ ]
+
+ with connection.cursor() as cursor:
+
+ for org in org_ids:
+
+ final_result_for_all__types = {}
+
+ for pjt_type in project_types:
+
+ sql_query = f"""
+ with annotation_tasks (language,count) as
+ (
+ SELECT
+ pjt.tgt_language,
+ count(tsk.id)
+ FROM
+ tasks_task AS tsk,
+ projects_project AS pjt
+ WHERE
+ tsk.project_id_id = pjt.id
+ AND tsk.task_status in ('annotated','reviewed','super_checked')
+ AND pjt.project_type in ('{pjt_type}')
+ AND pjt.organization_id_id = {org}
+ GROUP BY
+ pjt.tgt_language
+ ),reviewer_tasks (language,count) as
+ (
+ SELECT
+ pjt.tgt_language,
+ count(tsk.id)
+ FROM
+ tasks_task AS tsk,
+ projects_project AS pjt
+ WHERE
+ tsk.project_id_id = pjt.id
+ AND tsk.task_status in ('reviewed','super_checked')
+ AND pjt.project_stage in (2,3)
+ AND pjt.project_type in ('{pjt_type}')
+ AND pjt.organization_id_id = {org}
+ GROUP BY
+ pjt.tgt_language
+ )
+ ,superchecker_tasks (language,count) as
+ (
+ SELECT
+ pjt.tgt_language,
+ count(tsk.id)
+ FROM
+ tasks_task AS tsk,
+ projects_project AS pjt
+ WHERE
+ tsk.project_id_id = pjt.id
+ AND tsk.task_status in ('super_checked')
+ AND pjt.project_stage in (3)
+ AND pjt.project_type in ('{pjt_type}')
+ AND pjt.organization_id_id = {org}
+ GROUP BY
+ pjt.tgt_language
+ ),
+ annotation_tasks_exported (language,count) as
+ (
+ SELECT
+ pjt.tgt_language,
+ count(tsk.id)
+ FROM
+ tasks_task AS tsk,
+ projects_project AS pjt
+ WHERE
+ tsk.project_id_id = pjt.id
+ AND tsk.task_status in ('exported')
+ AND pjt.project_stage in (1)
+ AND pjt.project_type in ('{pjt_type}')
+ AND pjt.organization_id_id = {org}
+ GROUP BY
+ pjt.tgt_language
+ ), reviewer_tasks_exported (language,count) as
+ (
+ SELECT
+ pjt.tgt_language,
+ count(tsk.id)
+ FROM
+ tasks_task AS tsk,
+ projects_project AS pjt
+ WHERE
+ tsk.project_id_id = pjt.id
+ AND tsk.task_status in ('exported')
+ AND pjt.project_stage in (2)
+ AND pjt.project_type in ('{pjt_type}')
+ AND pjt.organization_id_id = {org}
+ GROUP BY
+ pjt.tgt_language
+ ), supercheck_tasks_exported (language,count) as
+ (
+ SELECT
+ pjt.tgt_language,
+ count(tsk.id)
+ FROM
+ tasks_task AS tsk,
+ projects_project AS pjt
+ WHERE
+ tsk.project_id_id = pjt.id
+ AND tsk.task_status in ('exported')
+ AND pjt.project_stage in (3)
+ AND pjt.project_type in ('{pjt_type}')
+ AND pjt.organization_id_id = {org}
+ GROUP BY
+ pjt.tgt_language
+ ),
+ reviewer_tasks_count (language,count,tag) as (
+ SELECT
+ language,
+ SUM(count) as task_count,
+ 'rew'
+ FROM (
+ SELECT language, count FROM reviewer_tasks
+ UNION ALL
+ SELECT language, count FROM reviewer_tasks_exported
+ UNION ALL
+ SELECT language, count FROM supercheck_tasks_exported
+ ) AS merged_tables
+ GROUP BY language
+ ),
+ annotation_tasks_count (language,count,tag) as (
+ SELECT
+ language,
+ SUM(count) as task_count,
+ 'ann'
+ FROM (
+ SELECT language, count FROM annotation_tasks
+ UNION ALL
+ SELECT language, count FROM annotation_tasks_exported
+ UNION ALL
+ SELECT language, count FROM reviewer_tasks_exported
+ UNION ALL
+ SELECT language, count FROM supercheck_tasks_exported
+ ) AS merged_tables
+ GROUP BY language
+ ),
+ supercheck_tasks_count (language,count,tag) as (
+ SELECT
+ language,
+ SUM(count) as task_count,
+ 'sup'
+ FROM (
+ SELECT language, count FROM superchecker_tasks
+ UNION ALL
+ SELECT language, count FROM supercheck_tasks_exported
+ ) AS merged_tables
+ GROUP BY language
+ ),
+ cumulative_task_counts (language,count,tag) as (
+ select language,count,tag from annotation_tasks_count
+ union all
+ select language,count,tag from reviewer_tasks_count
+ union all
+ select language,count,tag from supercheck_tasks_count
+ )
+ SELECT
+ language,
+ SUM(CASE WHEN tag = 'ann' THEN count ELSE 0 END) AS annotation_count,
+ SUM(CASE WHEN tag = 'rew' THEN count ELSE 0 END) AS reviewer_count,
+ SUM(CASE WHEN tag = 'sup' THEN count ELSE 0 END) AS superchecker_count
+ FROM cumulative_task_counts
+ GROUP BY language;
+ """
+ cursor.execute(sql=sql_query)
+ result = cursor.fetchall()
+ formatted_result = []
+ for langResult in result:
+ ann, rev, sup = langResult[1:]
+ formatted_result.append(
+ {
+ "language": checkLangNone(langResult[0]),
+ "ann_cumulative_tasks_count": int(str(ann)),
+ "rew_cumulative_tasks_count": int(str(rev)),
+ "sup_cumulative_tasks_count": int(str(sup)),
+ }
+ )
+ final_result_for_all__types[pjt_type] = formatted_result
+ upsert_stat("task_count", org, final_result_for_all__types)
+
+
+def fetch_workspace_task_counts():
+
+ sql_query = """
+ WITH
+ ANNOTATION_TASKS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
+ SELECT
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID,
+ COUNT(TSK.ID)
+ FROM
+ TASKS_TASK AS TSK,
+ PROJECTS_PROJECT AS PJT
+ WHERE
+ TSK.PROJECT_ID_ID = PJT.ID
+ AND TSK.TASK_STATUS IN ('annotated', 'reviewed', 'super_checked')
+ GROUP BY
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID
+ ),
+ REVIEWER_TASKS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
+ SELECT
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID,
+ COUNT(TSK.ID)
+ FROM
+ TASKS_TASK AS TSK,
+ PROJECTS_PROJECT AS PJT
+ WHERE
+ TSK.PROJECT_ID_ID = PJT.ID
+ AND TSK.TASK_STATUS IN ('reviewed', 'super_checked')
+ AND PJT.PROJECT_STAGE IN (2, 3)
+ GROUP BY
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID
+ ),
+ SUPERCHECKER_TASKS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
+ SELECT
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID,
+ COUNT(TSK.ID)
+ FROM
+ TASKS_TASK AS TSK,
+ PROJECTS_PROJECT AS PJT
+ WHERE
+ TSK.PROJECT_ID_ID = PJT.ID
+ AND TSK.TASK_STATUS IN ('super_checked')
+ AND PJT.PROJECT_STAGE IN (3)
+ GROUP BY
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID
+ ),
+ ANNOTATION_TASKS_EXPORTED (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
+ SELECT
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID,
+ COUNT(TSK.ID)
+ FROM
+ TASKS_TASK AS TSK,
+ PROJECTS_PROJECT AS PJT
+ WHERE
+ TSK.PROJECT_ID_ID = PJT.ID
+ AND TSK.TASK_STATUS IN ('exported')
+ AND PJT.PROJECT_STAGE IN (1)
+ GROUP BY
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID
+ ),
+ REVIEWER_TASKS_EXPORTED (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
+ SELECT
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID,
+ COUNT(TSK.ID)
+ FROM
+ TASKS_TASK AS TSK,
+ PROJECTS_PROJECT AS PJT
+ WHERE
+ TSK.PROJECT_ID_ID = PJT.ID
+ AND TSK.TASK_STATUS IN ('exported')
+ AND PJT.PROJECT_STAGE IN (2)
+ GROUP BY
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID
+ ),
+ SUPERCHECK_TASKS_EXPORTED (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT) AS (
+ SELECT
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID,
+ COUNT(TSK.ID)
+ FROM
+ TASKS_TASK AS TSK,
+ PROJECTS_PROJECT AS PJT
+ WHERE
+ TSK.PROJECT_ID_ID = PJT.ID
+ AND TSK.TASK_STATUS IN ('exported')
+ AND PJT.PROJECT_STAGE IN (3)
+ GROUP BY
+ PJT.TGT_LANGUAGE,
+ PJT.PROJECT_TYPE,
+ PJT.WORKSPACE_ID_ID
+ ),
+ REVIEWER_TASKS_COUNT (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ SUM(COUNT) AS TASK_COUNT,
+ 'rew'
+ FROM
+ (
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ REVIEWER_TASKS
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ REVIEWER_TASKS_EXPORTED
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ SUPERCHECK_TASKS_EXPORTED
+ ) AS MERGED_TABLES
+ GROUP BY
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID
+ ),
+ ANNOTATION_TASKS_COUNT (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ SUM(COUNT) AS TASK_COUNT,
+ 'ann'
+ FROM
+ (
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ ANNOTATION_TASKS
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ ANNOTATION_TASKS_EXPORTED
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ REVIEWER_TASKS_EXPORTED
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ SUPERCHECK_TASKS_EXPORTED
+ ) AS MERGED_TABLES
+ GROUP BY
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID
+ ),
+ SUPERCHECK_TASKS_COUNT (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ SUM(COUNT) AS TASK_COUNT,
+ 'sup'
+ FROM
+ (
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ SUPERCHECKER_TASKS
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT
+ FROM
+ SUPERCHECK_TASKS_EXPORTED
+ ) AS MERGED_TABLES
+ GROUP BY
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID
+ ),
+ CUMULATIVE_TASK_COUNTS (LANGUAGE, PROJECT_TYPE, WORKSPACE_ID, COUNT, TAG) AS (
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT,
+ TAG
+ FROM
+ ANNOTATION_TASKS_COUNT
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT,
+ TAG
+ FROM
+ REVIEWER_TASKS_COUNT
+ UNION ALL
+ SELECT
+ LANGUAGE,
+ PROJECT_TYPE,
+ WORKSPACE_ID,
+ COUNT,
+ TAG
+ FROM
+ SUPERCHECK_TASKS_COUNT
+ ),
+ WORKSPACE_COUNTS (
+ WORKSPACE_ID,
+ LANGUAGE,
+ PROJECT_TYPE,
+ ANNOTATION_COUNT,
+ REVIEWER_COUNT,
+ SUPERCHECKER_COUNT
+ ) AS (
+ SELECT
+ CTC.WORKSPACE_ID,
+ COALESCE(CTC.LANGUAGE, 'Others'),
+ CTC.PROJECT_TYPE,
+ SUM(
+ CASE
+ WHEN TAG = 'ann' THEN CTC.COUNT
+ ELSE 0
+ END
+ ) AS ANNOTATION_COUNT,
+ SUM(
+ CASE
+ WHEN TAG = 'rew' THEN CTC.COUNT
+ ELSE 0
+ END
+ ) AS REVIEWER_COUNT,
+ SUM(
+ CASE
+ WHEN TAG = 'sup' THEN CTC.COUNT
+ ELSE 0
+ END
+ ) AS SUPERCHECKER_COUNT
+ FROM
+ CUMULATIVE_TASK_COUNTS AS CTC
+ JOIN WORKSPACES_WORKSPACE AS WSP ON WSP.ID = CTC.WORKSPACE_ID
+ GROUP BY
+ CTC.LANGUAGE,
+ CTC.PROJECT_TYPE,
+ CTC.WORKSPACE_ID,
+ WSP.WORKSPACE_NAME
+ ),
+ AGGREGATED_DATA (WORKSPACE_ID, PROJECT_TYPE, PROJECT_DATA) AS (
+ SELECT
+ WORKSPACE_ID,
+ PROJECT_TYPE,
+ JSON_AGG(
+ JSON_BUILD_OBJECT(
+ 'language',
+ LANGUAGE,
+ 'ann_cumulative_tasks_count',
+ ANNOTATION_COUNT,
+ 'rew_cumulative_tasks_count',
+ REVIEWER_COUNT,
+ 'sup_cumulative_tasks_count',
+ SUPERCHECKER_COUNT
+ )
+ ) AS PROJECT_DATA
+ FROM
+ WORKSPACE_COUNTS
+ GROUP BY
+ PROJECT_TYPE,
+ WORKSPACE_ID
+ ),
+ WORKSPACE_TASK_COUNTS (WORKSPACE_ID, ORGANIZATION_ID, RESULT) AS (
+ SELECT
+ ADT.WORKSPACE_ID,
+ WSP.ORGANIZATION_ID,
+ JSON_OBJECT_AGG(ADT.PROJECT_TYPE, ADT.PROJECT_DATA) AS RESULT
+ FROM
+ AGGREGATED_DATA AS ADT
+ JOIN WORKSPACES_WORKSPACE AS WSP ON WSP.ID = ADT.WORKSPACE_ID
+ GROUP BY
+ ADT.WORKSPACE_ID,
+ WSP.ORGANIZATION_ID
+ )
+SELECT
+ ORGANIZATION_ID,
+ JSONB_OBJECT_AGG(WORKSPACE_ID, RESULT) as workspace_task_counts
+FROM
+ WORKSPACE_TASK_COUNTS
+GROUP BY
+ ORGANIZATION_ID
+ """
+ with connection.cursor() as cursor:
+
+ cursor.execute(sql=sql_query)
+ result = cursor.fetchall()
+ for org_id, workspace_task_counts in result:
+ workspace_task_counts = json.loads(workspace_task_counts)
+ upsert_stat(
+ stat_type="workspace_task_counts",
+ org_id=org_id,
+ result=workspace_task_counts,
+ )
diff --git a/backend/users/tasks.py b/backend/users/tasks.py
index b83601409..cc2ef62ec 100644
--- a/backend/users/tasks.py
+++ b/backend/users/tasks.py
@@ -3,9 +3,28 @@
from django.core.mail import send_mail
from celery.schedules import crontab
from shoonya_backend.celery import celery_app
-from user_reports import calculate_reports
+from user_reports import (
+ calculate_reports,
+ fetch_task_counts,
+ fetch_workspace_task_counts,
+)
+from celery.utils.log import get_task_logger
+
+logger = get_task_logger(__name__)
@shared_task(name="send_mail_task")
def send_mail_task():
calculate_reports()
+
+
+@shared_task(name="fetchTaskCounts")
+def fetchTaskCounts():
+ fetch_task_counts()
+ logger.info("Completed Task Count Update")
+
+
+@shared_task(name="fetchWorkspaceTaskCounts")
+def fetchWorkspaceTaskCounts():
+ fetch_workspace_task_counts()
+ logger.info("Completed Workspace Task Count Update")
diff --git a/backend/utils/constants.py b/backend/utils/constants.py
new file mode 100644
index 000000000..69e6bf258
--- /dev/null
+++ b/backend/utils/constants.py
@@ -0,0 +1,51 @@
+LANG_CHOICES = (
+ ("English", "English"),
+ ("Assamese", "Assamese"),
+ ("Bengali", "Bengali"),
+ ("Bodo", "Bodo"),
+ ("Dogri", "Dogri"),
+ ("Gujarati", "Gujarati"),
+ ("Hindi", "Hindi"),
+ ("Kannada", "Kannada"),
+ ("Kashmiri", "Kashmiri"),
+ ("Konkani", "Konkani"),
+ ("Maithili", "Maithili"),
+ ("Malayalam", "Malayalam"),
+ ("Manipuri", "Manipuri"),
+ ("Marathi", "Marathi"),
+ ("Nepali", "Nepali"),
+ ("Odia", "Odia"),
+ ("Punjabi", "Punjabi"),
+ ("Sanskrit", "Sanskrit"),
+ ("Santali", "Santali"),
+ ("Sindhi", "Sindhi"),
+ ("Sinhala", "Sinhala"),
+ ("Tamil", "Tamil"),
+ ("Telugu", "Telugu"),
+ ("Urdu", "Urdu"),
+)
+
+lang_codes = {
+ "assamese": "as",
+ "bengali": "bn",
+ "bodo": "brx",
+ "dogri": "doi",
+ "gujarati": "gu",
+ "hindi": "hi",
+ "kannada": "kn",
+ "kashmiri": "ks",
+ "konkani": "kok",
+ "maithili": "mai",
+ "malayalam": "ml",
+ "manipuri": "mni",
+ "marathi": "mr",
+ "nepali": "ne",
+ "odia": "or",
+ "punjabi": "pa",
+ "sanskrit": "sa",
+ "santali": "sat",
+ "sindhi": "sd",
+ "tamil": "ta",
+ "telugu": "te",
+ "urdu": "ur",
+}
diff --git a/backend/utils/dataset_utils.py b/backend/utils/dataset_utils.py
new file mode 100644
index 000000000..0cf950fec
--- /dev/null
+++ b/backend/utils/dataset_utils.py
@@ -0,0 +1,15 @@
+def get_batch_dataset_upload_status(instance_ids):
+ """
+ Batch fetch upload status for a list of dataset instance IDs.
+ Replace this with actual logic to retrieve status from your database.
+ """
+ # Mock data for testing
+ status_data = {}
+ for instance_id in instance_ids:
+ status_data[instance_id] = {
+ "last_upload_status": "Completed",
+ "last_upload_date": "2023-01-01",
+ "last_upload_time": "12:00:00",
+ "last_upload_result": "Success",
+ }
+    return status_data
diff --git a/backend/utils/filter_tasks_by_ann_type.py b/backend/utils/filter_tasks_by_ann_type.py
new file mode 100644
index 000000000..3fb641293
--- /dev/null
+++ b/backend/utils/filter_tasks_by_ann_type.py
@@ -0,0 +1,47 @@
+from tasks.models import (
+ Annotation,
+ ANNOTATOR_ANNOTATION,
+ REVIEWER_ANNOTATION,
+ SUPER_CHECKER_ANNOTATION,
+ LABELED,
+ ACCEPTED,
+ ACCEPTED_WITH_MINOR_CHANGES,
+ ACCEPTED_WITH_MAJOR_CHANGES,
+ VALIDATED,
+ VALIDATED_WITH_CHANGES,
+)
+
+
+def filter_tasks_by_ann_type(annotation_tasks, reviewer_tasks, supercheck_tasks):
+ filtered_annotation_tasks, filtered_reviewer_tasks, filtered_supercheck_tasks = (
+ [],
+ [],
+ [],
+ )
+ for a in annotation_tasks:
+ anno = Annotation.objects.filter(
+ task=a, annotation_type=ANNOTATOR_ANNOTATION, annotation_status=LABELED
+        ).first()
+ if anno:
+ filtered_annotation_tasks.append(a)
+ for r in reviewer_tasks:
+ anno = Annotation.objects.filter(
+ task=r,
+ annotation_type=REVIEWER_ANNOTATION,
+ annotation_status__in=[
+ ACCEPTED,
+ ACCEPTED_WITH_MINOR_CHANGES,
+ ACCEPTED_WITH_MAJOR_CHANGES,
+ ],
+        ).first()
+ if anno:
+ filtered_reviewer_tasks.append(r)
+ for s in supercheck_tasks:
+ anno = Annotation.objects.filter(
+ task=s,
+ annotation_type=SUPER_CHECKER_ANNOTATION,
+ annotation_status__in=[VALIDATED, VALIDATED_WITH_CHANGES],
+        ).first()
+ if anno:
+ filtered_supercheck_tasks.append(s)
+ return filtered_annotation_tasks, filtered_reviewer_tasks, filtered_supercheck_tasks
diff --git a/backend/workspaces/views.py b/backend/workspaces/views.py
index e5cd136d9..2318b1866 100644
--- a/backend/workspaces/views.py
+++ b/backend/workspaces/views.py
@@ -8,7 +8,7 @@
from projects.models import Project, ANNOTATION_STAGE, REVIEW_STAGE, SUPERCHECK_STAGE
from users.models import User
from users.serializers import UserProfileSerializer
-from tasks.models import Task
+from tasks.models import Task, Statistic
from organizations.models import Organization
from django.db.models import Q
from projects.utils import no_of_words
@@ -61,7 +61,7 @@
get_review_reports,
get_supercheck_reports,
)
-
+from utils.filter_tasks_by_ann_type import filter_tasks_by_ann_type
# Create your views here.
@@ -1404,23 +1404,41 @@ def cumulative_tasks_count_all(self, request, pk=None):
other_lang = []
for lang in languages:
proj_lang_filter = proj_objs.filter(tgt_language=lang)
- annotation_tasks_count = 0
- reviewer_task_count = 0
+ annotation_tasks = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ task_status__in=[
+ "annotated",
+ "reviewed",
+ "super_checked",
+ ],
+ )
reviewer_tasks = Task.objects.filter(
project_id__in=proj_lang_filter,
project_id__project_stage__in=[REVIEW_STAGE, SUPERCHECK_STAGE],
- task_status__in=["reviewed", "exported", "super_checked"],
+ task_status__in=["reviewed", "super_checked"],
)
-
- annotation_tasks = Task.objects.filter(
+ supercheck_tasks = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[SUPERCHECK_STAGE],
+ task_status__in=["super_checked"],
+ )
+ annotation_tasks_exported = Task.objects.filter(
project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[ANNOTATION_STAGE],
task_status__in=[
- "annotated",
- "reviewed",
"exported",
- "super_checked",
],
)
+ reviewer_tasks_exported = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[REVIEW_STAGE],
+ task_status__in=["exported"],
+ )
+ supercheck_tasks_exported = Task.objects.filter(
+ project_id__in=proj_lang_filter,
+ project_id__project_stage__in=[SUPERCHECK_STAGE],
+ task_status__in=["exported"],
+ )
if metainfo == True:
result = {}
@@ -1636,15 +1654,30 @@ def cumulative_tasks_count_all(self, request, pk=None):
}
else:
- reviewer_task_count = reviewer_tasks.count()
-
- annotation_tasks_count = annotation_tasks.count()
-
- result = {
- "language": lang,
- "ann_cumulative_tasks_count": annotation_tasks_count,
- "rew_cumulative_tasks_count": reviewer_task_count,
- }
+ # reviewer_task_count = (
+ # reviewer_tasks.count()
+ # + reviewer_tasks_exported.count()
+ # + supercheck_tasks_exported.count()
+ # )
+
+ # annotation_tasks_count = (
+ # annotation_tasks.count()
+ # + annotation_tasks_exported.count()
+ # + reviewer_tasks_exported.count()
+ # + supercheck_tasks_exported.count()
+ # )
+
+ # supercheck_tasks_count = (
+ # supercheck_tasks.count() + supercheck_tasks_exported.count()
+ # )
+
+ # result = {
+ # "language": lang,
+ # "ann_cumulative_tasks_count": annotation_tasks_count,
+ # "rew_cumulative_tasks_count": reviewer_task_count,
+ # "sup_cumulative_tasks_count": supercheck_tasks_count,
+ # }
+ result = {}
if lang == None or lang == "":
other_lang.append(result)
@@ -1653,6 +1686,7 @@ def cumulative_tasks_count_all(self, request, pk=None):
ann_task_count = 0
rew_task_count = 0
+ sup_task_count = 0
ann_word_count = 0
rew_word_count = 0
ann_aud_dur = 0
@@ -1665,8 +1699,10 @@ def cumulative_tasks_count_all(self, request, pk=None):
rev_sentance_count = 0
for dat in other_lang:
if metainfo != True:
- ann_task_count += dat["ann_cumulative_tasks_count"]
- rew_task_count += dat["rew_cumulative_tasks_count"]
+ # ann_task_count += dat["ann_cumulative_tasks_count"]
+ # rew_task_count += dat["rew_cumulative_tasks_count"]
+ # sup_task_count += dat["sup_cumulative_tasks_count"]
+ pass
else:
if project_type in get_audio_project_types():
ann_aud_dur += convert_hours_to_seconds(
@@ -1706,11 +1742,13 @@ def cumulative_tasks_count_all(self, request, pk=None):
if len(other_lang) > 0:
if metainfo != True:
- other_language = {
- "language": "Others",
- "ann_cumulative_tasks_count": ann_task_count,
- "rew_cumulative_tasks_count": rew_task_count,
- }
+ other_language = {}
+ # other_language = {
+ # "language": "Others",
+ # "ann_cumulative_tasks_count": ann_task_count,
+ # "rew_cumulative_tasks_count": rew_task_count,
+ # "sup_cumulative_tasks_count": sup_task_count,
+ # }
else:
if project_type in get_audio_project_types():
other_language = {
@@ -1782,6 +1820,18 @@ def cumulative_tasks_count_all(self, request, pk=None):
pass
else:
final_result_for_all_types[project_type] = final_result
+
+ if metainfo != True:
+ workspace = Workspace.objects.get(pk=pk)
+            # organization_id selects the Statistic row holding this org's precomputed counts
+            stat = Statistic.objects.filter(
+                stat_type="workspace_task_counts",
+                org_id=workspace.organization_id,
+            ).first()
+            task_counts = stat.result if stat else {}
+
+            for pjt_type in project_types:
+                final_result_for_all_types[pjt_type] = task_counts.get(str(pk), {}).get(pjt_type, [])
return Response(final_result_for_all_types)
@action(