diff --git a/main.py b/main.py
index fbc960a..abcd213 100644
--- a/main.py
+++ b/main.py
@@ -125,6 +125,9 @@ async def create_datagram_endpoint(self, protocol_factory,
 # model threads
 model_threads = settings.model_threads
 
+# Default to supporting chunking
+has_chunking = True
+
 # Try CUDA
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -151,6 +154,12 @@ async def create_datagram_endpoint(self, protocol_factory,
     logger.info(f'CUDA: Device {cuda_dev_num} total memory: {cuda_total_memory} bytes')
     logger.info(f'CUDA: Device {cuda_dev_num} free memory: {cuda_free_memory} bytes')
 
+    # Disable chunking if card has less than 10GB VRAM (complete guess)
+    # This can still encounter out of memory errors depending on audio length
+    if cuda_free_memory <= 10000000000:
+        logger.warning(f'CUDA: Device {cuda_dev_num} has low memory, disabling chunking support')
+        has_chunking = False
+
     # Override compute_type if at least one non-Turing card
     if cuda_device_capability <= 70:
         logger.warning(f'CUDA: Device {cuda_dev_num} is pre-Turing, forcing int8')
@@ -255,8 +264,11 @@ def do_whisper(audio_file, model, beam_size, task, detect_language, return_langu
     beam_size = long_beam_size
     use_chunking = False
     if audio_duration > 30*1000:
-        logger.debug(f'WHISPER: Audio duration is > 30s - activating chunking')
-        use_chunking = True
+        if has_chunking:
+            logger.debug(f'WHISPER: Audio duration is > 30s - activating chunking')
+            use_chunking = True
+        else:
+            logger.warning(f'WHISPER: Audio duration is > 30s but chunking is not available. Will truncate!')
 
     time_end = datetime.datetime.now()
     infer_time = time_end - first_time_start
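
For reference, here is the new gating logic in isolation. This is a minimal standalone sketch, not the patch itself: it assumes PyTorch provides cuda_free_memory via torch.cuda.mem_get_info (the patch does not show where the value comes from) and a plain logging logger; the 10 GB threshold is the same admitted guess as in the patch.

import logging

import torch

logger = logging.getLogger(__name__)

# Default to supporting chunking
has_chunking = True

if torch.cuda.is_available():
    for cuda_dev_num in range(torch.cuda.device_count()):
        # mem_get_info returns (free, total) memory in bytes for the device
        cuda_free_memory, cuda_total_memory = torch.cuda.mem_get_info(cuda_dev_num)
        # Heuristic mirrored from the patch: with under ~10GB free, chunked
        # decoding of long audio is assumed likely to run out of memory,
        # so any one low-memory device disables chunking globally
        if cuda_free_memory <= 10000000000:
            logger.warning(f'CUDA: Device {cuda_dev_num} has low memory, disabling chunking support')
            has_chunking = False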