From ca75f9746b762918bbbecd1d78b09226c7747002 Mon Sep 17 00:00:00 2001
From: array
Date: Fri, 22 May 2026 16:41:50 +0800
Subject: [PATCH] feat: add AudioModalProcessor for speech-to-text transcription

Add support for audio file processing (MP3, WAV, FLAC, M4A, OGG, etc.)
using faster-whisper for local ASR transcription.

Key features:
- Timestamped transcription output for precise retrieval
- VAD filtering to skip silence
- Lazy model loading (only loads whisper when first audio is processed)
- Transcription runs in a worker thread so the event loop is not blocked
- Configurable via WHISPER_MODEL and WHISPER_LANGUAGE env vars
  (an empty value is treated the same as unset)
- Added as optional dependency: pip install raganything[audio]

Use cases: meeting recordings, phone calls, podcasts, lectures.
---
 env.example                          |   6 +-
 pyproject.toml                       |   2 +
 raganything/__init__.py              |  18 ++
 raganything/config.py                |   2 +-
 raganything/modalprocessors_audio.py | 328 +++++++++++++++++++++++++++
 raganything/processor.py             |   9 +
 tests/test_audio_processor.py        | 211 +++++++++++++++++
 7 files changed, 574 insertions(+), 2 deletions(-)
 create mode 100644 raganything/modalprocessors_audio.py
 create mode 100644 tests/test_audio_processor.py

diff --git a/env.example b/env.example
index 84d774970..a1f27781a 100644
--- a/env.example
+++ b/env.example
@@ -50,9 +50,13 @@ OLLAMA_EMULATING_MODEL_TAG=latest
 # ENABLE_TABLE_PROCESSING=true
 # ENABLE_EQUATION_PROCESSING=true
 
+### Audio Processing Configuration (requires: pip install raganything[audio])
+# WHISPER_MODEL=base  # tiny/base/small/medium/large-v3
+# WHISPER_LANGUAGE=  # Auto-detect if empty. Set to "zh", "en", etc.
+
 ### Batch Processing Configuration
 # MAX_CONCURRENT_FILES=1
-# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md
+# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md,.mp3,.wav,.flac,.m4a,.ogg,.wma,.aac,.opus
 # RECURSIVE_FOLDER_PROCESSING=true
 
 ### Context Extraction Configuration
diff --git a/pyproject.toml b/pyproject.toml
index e612fa4e4..7d81cd239 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ markdown = [
     "weasyprint>=60.0",
     "pygments>=2.10.0",
 ]
+audio = ["faster-whisper>=1.0.0"]
 all = [
     "Pillow>=10.0.0",
     "reportlab>=4.0.0",
@@ -48,6 +49,7 @@ all = [
     "markdown>=3.4.0",
     "weasyprint>=60.0",
     "pygments>=2.10.0",
+    "faster-whisper>=1.0.0",
 ]
 
 [project.urls]
diff --git a/raganything/__init__.py b/raganything/__init__.py
index fa8efb10c..85b0cf538 100644
--- a/raganything/__init__.py
+++ b/raganything/__init__.py
@@ -43,6 +43,16 @@
 except ImportError:
     pass
 
+# Optional: audio modal processor (requires faster-whisper).
+try:
+    from .modalprocessors_audio import (
+        AudioModalProcessor as AudioModalProcessor,
+        is_audio_file as is_audio_file,
+    )
+except ImportError:
+    # Audio module could not be imported (faster-whisper itself loads lazily).
+    pass
+
 # Optional: multilingual prompt manager.
 try:
     from .prompt_manager import (
@@ -97,6 +107,14 @@
         ]
     )
 
+if "AudioModalProcessor" in globals():
+    __all__.extend(
+        [
+            "AudioModalProcessor",
+            "is_audio_file",
+        ]
+    )
+
 if "set_prompt_language" in globals():
     __all__.extend(
         [
diff --git a/raganything/config.py b/raganything/config.py
index c1969b396..1d07e1ff2 100644
--- a/raganything/config.py
+++ b/raganything/config.py
@@ -63,7 +63,7 @@ class RAGAnythingConfig:
             x.strip()
             for x in get_env_value(
                 "SUPPORTED_FILE_EXTENSIONS",
-                ".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md",
+                ".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md,.mp3,.wav,.flac,.m4a,.ogg,.wma,.aac,.opus",
                 str,
             ).split(",")
         ]
diff --git a/raganything/modalprocessors_audio.py b/raganything/modalprocessors_audio.py
new file mode 100644
index 000000000..22e1b4bf1
--- /dev/null
+++ b/raganything/modalprocessors_audio.py
@@ -0,0 +1,328 @@
+"""
+Audio Modal Processor for RAG-Anything
+
+Processes audio files (MP3, WAV, FLAC, M4A, OGG) by transcribing speech to text
+using faster-whisper, then feeding the transcribed text into LightRAG's knowledge graph.
+
+Supports:
+- Speech-to-text transcription with timestamps
+- Meeting recordings, phone calls, podcasts, lectures
+- Multiple languages (auto-detect or specify)
+
+Dependencies:
+    pip install raganything[audio]
+    # or: pip install faster-whisper
+"""
+
+import asyncio
+import json
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+from lightrag.utils import compute_mdhash_id
+
+from .modalprocessors import BaseModalProcessor
+from .prompt import PROMPTS
+
+logger = logging.getLogger(__name__)
+
+# Supported audio file extensions
+AUDIO_EXTENSIONS = {".mp3", ".wav", ".flac", ".m4a", ".ogg", ".wma", ".aac", ".opus"}
+
+
+def is_audio_file(file_path: str) -> bool:
+    """Check if a file is a supported audio format."""
+    return Path(file_path).suffix.lower() in AUDIO_EXTENSIONS
+
+
+class AudioModalProcessor(BaseModalProcessor):
+    """Processor for audio content using faster-whisper for transcription.
+
+    Transcribes audio files into timestamped text segments, then processes
+    them through LightRAG for knowledge graph construction and retrieval.
+
+    Suitable for:
+    - Meeting recordings
+    - Phone call recordings
+    - Podcasts and interviews
+    - Lectures and presentations
+    - Voice memos
+
+    Example:
+        >>> processor = AudioModalProcessor(
+        ...     lightrag=rag_instance,
+        ...     modal_caption_func=caption_func,
+        ...     whisper_model="large-v3",
+        ... )
+        >>> result = await processor.process_multimodal_content(
+        ...     modal_content={"audio_path": "/path/to/meeting.mp3"},
+        ...     content_type="audio",
+        ... )
+    """
+
+    def __init__(
+        self,
+        lightrag,
+        modal_caption_func,
+        context_extractor=None,
+        whisper_model: str = None,
+        whisper_device: str = "auto",
+        whisper_compute_type: str = "auto",
+        language: str = None,
+        segment_min_length: int = 30,
+    ):
+        """Initialize audio processor.
+
+        Args:
+            lightrag: LightRAG instance
+            modal_caption_func: Function for generating descriptions
+            context_extractor: Context extractor instance
+            whisper_model: Whisper model size (tiny/base/small/medium/large-v3)
+                Defaults to env WHISPER_MODEL or "base"
+            whisper_device: Device for inference ("auto", "cpu", "cuda")
+            whisper_compute_type: Compute type ("auto", "float16", "int8")
+            language: Language code (e.g., "zh", "en"). None for auto-detect.
+            segment_min_length: Minimum segment length in characters to keep
+        """
+        super().__init__(lightrag, modal_caption_func, context_extractor)
+
+        # ``or`` rather than a ``.get`` default, so that an empty environment
+        # value such as ``WHISPER_MODEL=`` also falls back to "base".
+        self.whisper_model_name = (
+            whisper_model or os.environ.get("WHISPER_MODEL") or "base"
+        )
+        self.whisper_device = whisper_device
+        self.whisper_compute_type = whisper_compute_type
+        # An empty WHISPER_LANGUAGE means auto-detect, which whisper spells None.
+        self.language = language or os.environ.get("WHISPER_LANGUAGE") or None
+        self.segment_min_length = segment_min_length
+        self._whisper_model = None
+
+    @property
+    def whisper(self):
+        """Lazy-load whisper model on first use."""
+        if self._whisper_model is None:
+            try:
+                from faster_whisper import WhisperModel
+            except ImportError as exc:
+                raise ImportError(
+                    "faster-whisper is required for audio processing. "
+                    "Install it with: pip install raganything[audio] "
+                    "or: pip install faster-whisper"
+                ) from exc
+
+            logger.info(
+                f"Loading whisper model: {self.whisper_model_name} "
+                f"(device={self.whisper_device})"
+            )
+            self._whisper_model = WhisperModel(
+                self.whisper_model_name,
+                device=self.whisper_device,
+                compute_type=self.whisper_compute_type,
+            )
+        return self._whisper_model
+
+    def transcribe(self, audio_path: str) -> List[Dict[str, Any]]:
+        """Transcribe audio file to timestamped segments.
+
+        Args:
+            audio_path: Path to the audio file
+
+        Returns:
+            List of segments with start, end (seconds) and text
+        """
+        if not Path(audio_path).exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        logger.info(f"Transcribing audio: {audio_path}")
+
+        segments_iter, info = self.whisper.transcribe(
+            audio_path,
+            language=self.language,
+            vad_filter=True,  # Filter out silence
+            vad_parameters=dict(min_silence_duration_ms=500),
+        )
+
+        segments = []
+        for segment in segments_iter:
+            text = segment.text.strip()
+            if len(text) >= self.segment_min_length:
+                segments.append(
+                    {
+                        "start": segment.start,
+                        "end": segment.end,
+                        "text": text,
+                    }
+                )
+
+        logger.info(
+            f"Transcription complete: {len(segments)} segments, "
+            f"language={info.language}, duration={info.duration:.1f}s"
+        )
+        return segments
+
+    def _format_timestamp(self, seconds: float) -> str:
+        """Format seconds to HH:MM:SS or MM:SS."""
+        h = int(seconds // 3600)
+        m = int((seconds % 3600) // 60)
+        s = int(seconds % 60)
+        if h > 0:
+            return f"{h}:{m:02d}:{s:02d}"
+        return f"{m}:{s:02d}"
+
+    def _segments_to_text(self, segments: List[Dict[str, Any]]) -> str:
+        """Convert transcription segments to formatted text with timestamps."""
+        lines = []
+        for seg in segments:
+            start_str = self._format_timestamp(seg["start"])
+            end_str = self._format_timestamp(seg["end"])
+            lines.append(f"[{start_str}-{end_str}] {seg['text']}")
+        return "\n".join(lines)
+
+    async def generate_description_only(
+        self,
+        modal_content,
+        content_type: str,
+        item_info: Dict[str, Any] = None,
+        entity_name: str = None,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """Generate audio transcription and entity info.
+
+        Args:
+            modal_content: Audio content dict with 'audio_path' key
+            content_type: Type of modal content ("audio")
+            item_info: Item information for context extraction
+            entity_name: Optional predefined entity name
+
+        Returns:
+            Tuple of (transcription_text, entity_info)
+        """
+        try:
+            # Parse audio content
+            if isinstance(modal_content, str):
+                try:
+                    content_data = json.loads(modal_content)
+                except json.JSONDecodeError:
+                    content_data = {"audio_path": modal_content}
+            else:
+                content_data = modal_content
+
+            audio_path = content_data.get("audio_path") or content_data.get("img_path")
+            if not audio_path:
+                raise ValueError(
+                    f"No audio path provided in modal_content: {modal_content}"
+                )
+
+            # Transcribe in a worker thread: whisper inference is a long blocking
+            # call and would otherwise stall every other task on the event loop.
+            loop = asyncio.get_running_loop()
+            segments = await loop.run_in_executor(None, self.transcribe, audio_path)
+            if not segments:
+                raise RuntimeError(f"No speech detected in audio: {audio_path}")
+
+            # Format transcription
+            transcription = self._segments_to_text(segments)
+
+            # Generate entity info
+            filename = Path(audio_path).stem
+            # ``segments`` is guaranteed non-empty by the check above.
+            duration = segments[-1]["end"]
+            entity_info = {
+                "entity_name": entity_name
+                if entity_name
+                else f"audio_{filename}",
+                "entity_type": "audio",
+                "summary": (
+                    f"Audio recording ({self._format_timestamp(duration)} duration). "
+                    f"Transcription: {segments[0]['text'][:100]}..."
+                ),
+            }
+
+            return transcription, entity_info
+
+        except Exception as e:
+            logger.error(f"Error generating audio transcription: {e}")
+            fallback_entity = {
+                "entity_name": entity_name
+                if entity_name
+                else f"audio_{compute_mdhash_id(str(modal_content))}",
+                "entity_type": "audio",
+                "summary": f"Audio content: {str(modal_content)[:100]}",
+            }
+            return str(modal_content), fallback_entity
+
+    async def process_multimodal_content(
+        self,
+        modal_content,
+        content_type: str,
+        file_path: str = "manual_creation",
+        entity_name: str = None,
+        item_info: Dict[str, Any] = None,
+        batch_mode: bool = False,
+        doc_id: str = None,
+        chunk_order_index: int = 0,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """Process audio content: transcribe and insert into knowledge graph.
+
+        Args:
+            modal_content: Audio content dict with 'audio_path' key
+            content_type: Type of modal content ("audio")
+            file_path: Source file path for attribution
+            entity_name: Optional entity name
+            item_info: Item info for context
+            batch_mode: Whether in batch processing mode
+            doc_id: Document ID
+            chunk_order_index: Chunk ordering index
+
+        Returns:
+            Tuple of (chunk_text, entity_info)
+        """
+        try:
+            # Generate transcription and entity info
+            transcription, entity_info = await self.generate_description_only(
+                modal_content, content_type, item_info, entity_name
+            )
+
+            # Parse audio path for chunk formatting
+            if isinstance(modal_content, str):
+                try:
+                    content_data = json.loads(modal_content)
+                except json.JSONDecodeError:
+                    content_data = {"audio_path": modal_content}
+            else:
+                content_data = modal_content
+
+            audio_path = content_data.get("audio_path") or content_data.get(
+                "img_path", ""
+            )
+
+            # Build audio chunk text
+            modal_chunk = (
+                f"[Audio Content]\n"
+                f"Source: {audio_path}\n"
+                f"Entity: {entity_info['entity_name']}\n"
+                f"Transcription:\n{transcription}"
+            )
+
+            return await self._create_entity_and_chunk(
+                modal_chunk,
+                entity_info,
+                file_path,
+                batch_mode,
+                doc_id,
+                chunk_order_index,
+            )
+
+        except Exception as e:
+            logger.error(f"Error processing audio content: {e}")
+            fallback_entity = {
+                "entity_name": entity_name
+                if entity_name
+                else f"audio_{compute_mdhash_id(str(modal_content))}",
+                "entity_type": "audio",
+                "summary": f"Audio content: {str(modal_content)[:100]}",
+            }
+            return str(modal_content), fallback_entity
diff --git a/raganything/processor.py b/raganything/processor.py
index add0de017..bd727b183 100644
--- a/raganything/processor.py
+++ b/raganything/processor.py
@@ -1172,6 +1172,15 @@ def _apply_chunk_template(
                 enhanced_caption=description,
             )
 
+        elif content_type == "audio":
+            audio_path = original_item.get("audio_path", original_item.get("img_path", ""))
+
+            return (
+                f"[Audio Content]\n"
+                f"Source: {audio_path}\n"
+                f"Transcription:\n{description}"
+            )
+
         else:
             # generic or unknown types
             content = str(original_item.get("content", original_item))
diff --git a/tests/test_audio_processor.py b/tests/test_audio_processor.py
new file mode 100644
index 000000000..35572cd5a
--- /dev/null
+++ b/tests/test_audio_processor.py
@@ -0,0 +1,211 @@
+"""Tests for the AudioModalProcessor."""
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+# faster-whisper is imported lazily by the processor, so these tests run
+# without it installed; transcription itself is mocked.
+from raganything.modalprocessors_audio import AudioModalProcessor, is_audio_file
+
+
+class TestIsAudioFile:
+    """Test audio file detection."""
+
+    def test_supported_extensions(self):
+        assert is_audio_file("meeting.mp3")
+        assert is_audio_file("recording.wav")
+        assert is_audio_file("podcast.flac")
+        assert is_audio_file("voice.m4a")
+        assert is_audio_file("music.ogg")
+        assert is_audio_file("/path/to/file.WAV")  # case insensitive
+        assert is_audio_file("call.aac")
+        assert is_audio_file("audio.opus")
+
+    def test_unsupported_extensions(self):
+        assert not is_audio_file("document.pdf")
+        assert not is_audio_file("image.png")
+        assert not is_audio_file("video.mp4")
+        assert not is_audio_file("text.txt")
+        assert not is_audio_file("no_extension")
+
+
+class TestAudioModalProcessorInit:
+    """Test AudioModalProcessor initialization."""
+
+    def test_default_init(self, monkeypatch):
+        monkeypatch.delenv("WHISPER_MODEL", raising=False)
+        monkeypatch.delenv("WHISPER_LANGUAGE", raising=False)
+
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        caption_func = AsyncMock()
+
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=caption_func,
+        )
+        assert processor.whisper_model_name == "base"
+        assert processor.whisper_device == "auto"
+        assert processor.language is None
+
+    def test_custom_model(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        caption_func = AsyncMock()
+
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=caption_func,
+            whisper_model="large-v3",
+            language="zh",
+        )
+        assert processor.whisper_model_name == "large-v3"
+        assert processor.language == "zh"
+
+    def test_env_override(self, monkeypatch):
+        monkeypatch.setenv("WHISPER_MODEL", "medium")
+        monkeypatch.setenv("WHISPER_LANGUAGE", "en")
+
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        caption_func = AsyncMock()
+
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=caption_func,
+        )
+        assert processor.whisper_model_name == "medium"
+        assert processor.language == "en"
+
+    def test_empty_env_falls_back_to_defaults(self, monkeypatch):
+        # ``WHISPER_LANGUAGE=`` (empty) is how env.example spells auto-detect.
+        monkeypatch.setenv("WHISPER_MODEL", "")
+        monkeypatch.setenv("WHISPER_LANGUAGE", "")
+
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+        assert processor.whisper_model_name == "base"
+        assert processor.language is None
+
+
+class TestFormatTimestamp:
+    """Test timestamp formatting."""
+
+    def setup_method(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        self.processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+    def test_seconds_only(self):
+        assert self.processor._format_timestamp(45) == "0:45"
+
+    def test_minutes_and_seconds(self):
+        assert self.processor._format_timestamp(125) == "2:05"
+
+    def test_hours(self):
+        assert self.processor._format_timestamp(3661) == "1:01:01"
+
+    def test_zero(self):
+        assert self.processor._format_timestamp(0) == "0:00"
+
+
+class TestSegmentsToText:
+    """Test segment formatting."""
+
+    def setup_method(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        self.processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+    def test_single_segment(self):
+        segments = [{"start": 0, "end": 30, "text": "Hello world"}]
+        result = self.processor._segments_to_text(segments)
+        assert result == "[0:00-0:30] Hello world"
+
+    def test_multiple_segments(self):
+        segments = [
+            {"start": 0, "end": 30, "text": "First segment"},
+            {"start": 30, "end": 65, "text": "Second segment"},
+        ]
+        result = self.processor._segments_to_text(segments)
+        assert "[0:00-0:30] First segment" in result
+        assert "[0:30-1:05] Second segment" in result
+
+    def test_empty_segments(self):
+        result = self.processor._segments_to_text([])
+        assert result == ""
+
+
+@pytest.mark.asyncio
+class TestGenerateDescriptionOnly:
+    """Test the generate_description_only method."""
+
+    async def test_file_not_found(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+        # Should return fallback on missing file
+        result, entity_info = await processor.generate_description_only(
+            {"audio_path": "/nonexistent/file.mp3"},
+            "audio",
+        )
+        assert entity_info["entity_type"] == "audio"
+
+    async def test_with_dict_content(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+        # Mock transcribe to avoid needing actual audio
+        mock_segments = [
+            {"start": 0, "end": 10, "text": "This is a test transcription"},
+            {"start": 10, "end": 20, "text": "Second part of the audio"},
+        ]
+        processor.transcribe = MagicMock(return_value=mock_segments)
+
+        result, entity_info = await processor.generate_description_only(
+            {"audio_path": "/tmp/test.mp3"},
+            "audio",
+        )
+
+        assert "[0:00-0:10] This is a test transcription" in result
+        assert "[0:10-0:20] Second part of the audio" in result
+        assert entity_info["entity_type"] == "audio"
+        assert "audio_test" in entity_info["entity_name"]
+
+    async def test_with_string_content(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+        mock_segments = [{"start": 0, "end": 5, "text": "Hello"}]
+        processor.transcribe = MagicMock(return_value=mock_segments)
+
+        # Pass path as string directly
+        result, entity_info = await processor.generate_description_only(
+            "/tmp/test.mp3",
+            "audio",
+        )
+        assert "[0:00-0:05] Hello" in result