From ca75f9746b762918bbbecd1d78b09226c7747002 Mon Sep 17 00:00:00 2001
From: array <array@noreply.github.com>
Date: Fri, 22 May 2026 16:41:50 +0800
Subject: [PATCH] feat: add AudioModalProcessor for speech-to-text
 transcription

Add support for audio file processing (MP3, WAV, FLAC, M4A, OGG, etc.)
using faster-whisper for local ASR transcription.

Key features:
- Timestamped transcription output for precise retrieval
- VAD filtering to skip silence
- Lazy model loading (only loads whisper when first audio is processed)
- Configurable via WHISPER_MODEL and WHISPER_LANGUAGE env vars
- Added as optional dependency: pip install raganything[audio]

Use cases: meeting recordings, phone calls, podcasts, lectures.
---
 env.example                          |   6 +-
 pyproject.toml                       |   2 +
 raganything/__init__.py              |  18 ++
 raganything/config.py                |   2 +-
 raganything/modalprocessors_audio.py | 323 +++++++++++++++++++++++++++
 raganything/processor.py             |   9 +
 tests/test_audio_processor.py        | 202 +++++++++++++++++
 7 files changed, 560 insertions(+), 2 deletions(-)
 create mode 100644 raganything/modalprocessors_audio.py
 create mode 100644 tests/test_audio_processor.py

diff --git a/env.example b/env.example
index 84d774970..a1f27781a 100644
--- a/env.example
+++ b/env.example
@@ -50,9 +50,13 @@ OLLAMA_EMULATING_MODEL_TAG=latest
 # ENABLE_TABLE_PROCESSING=true
 # ENABLE_EQUATION_PROCESSING=true
 
+### Audio Processing Configuration (requires: pip install raganything[audio])
+# WHISPER_MODEL=base               # tiny/base/small/medium/large-v3
+# WHISPER_LANGUAGE=                 # Auto-detect if empty. Set to "zh", "en", etc.
+
 ### Batch Processing Configuration
 # MAX_CONCURRENT_FILES=1
-# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md
+# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md,.mp3,.wav,.flac,.m4a,.ogg,.wma,.aac,.opus
 # RECURSIVE_FOLDER_PROCESSING=true
 
 ### Context Extraction Configuration
diff --git a/pyproject.toml b/pyproject.toml
index e612fa4e4..7d81cd239 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ markdown = [
     "weasyprint>=60.0",
     "pygments>=2.10.0",
 ]
+audio = ["faster-whisper>=1.0.0"]
 all = [
     "Pillow>=10.0.0",
     "reportlab>=4.0.0",
@@ -48,6 +49,7 @@ all = [
     "markdown>=3.4.0",
     "weasyprint>=60.0",
     "pygments>=2.10.0",
+    "faster-whisper>=1.0.0",
 ]
 
 [project.urls]
diff --git a/raganything/__init__.py b/raganything/__init__.py
index fa8efb10c..85b0cf538 100644
--- a/raganything/__init__.py
+++ b/raganything/__init__.py
@@ -43,6 +43,16 @@
 except ImportError:
     pass
 
+# Optional: audio modal processor (faster-whisper is only needed at transcription time).
+try:
+    from .modalprocessors_audio import (
+        AudioModalProcessor as AudioModalProcessor,
+        is_audio_file as is_audio_file,
+    )
+except ImportError:
+    # Audio module (or one of its own imports) is unavailable; skip the audio exports.
+    pass
+
 # Optional: multilingual prompt manager.
 try:
     from .prompt_manager import (
@@ -97,6 +107,14 @@
         ]
     )
 
+if "AudioModalProcessor" in globals():
+    __all__.extend(
+        [
+            "AudioModalProcessor",
+            "is_audio_file",
+        ]
+    )
+
 if "set_prompt_language" in globals():
     __all__.extend(
         [
diff --git a/raganything/config.py b/raganything/config.py
index c1969b396..1d07e1ff2 100644
--- a/raganything/config.py
+++ b/raganything/config.py
@@ -63,7 +63,7 @@ class RAGAnythingConfig:
             x.strip()
             for x in get_env_value(
                 "SUPPORTED_FILE_EXTENSIONS",
-                ".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md",
+                ".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md,.mp3,.wav,.flac,.m4a,.ogg,.wma,.aac,.opus",
                 str,
             ).split(",")
         ]
diff --git a/raganything/modalprocessors_audio.py b/raganything/modalprocessors_audio.py
new file mode 100644
index 000000000..22e1b4bf1
--- /dev/null
+++ b/raganything/modalprocessors_audio.py
@@ -0,0 +1,323 @@
+"""
+Audio Modal Processor for RAG-Anything
+
+Processes audio files (MP3, WAV, FLAC, M4A, OGG, WMA, AAC, Opus) by transcribing
+speech with faster-whisper and feeding the transcript into LightRAG's knowledge graph.
+
+Supports:
+- Speech-to-text transcription with timestamps
+- Meeting recordings, phone calls, podcasts, lectures
+- Multiple languages (auto-detect or specify)
+
+Dependencies:
+    pip install raganything[audio]
+    # or: pip install faster-whisper
+"""
+
+import json
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+from lightrag.utils import compute_mdhash_id
+
+from .modalprocessors import BaseModalProcessor
+from .prompt import PROMPTS
+
+logger = logging.getLogger(__name__)
+
+# Supported audio file extensions
+AUDIO_EXTENSIONS = {".mp3", ".wav", ".flac", ".m4a", ".ogg", ".wma", ".aac", ".opus"}
+
+
+def is_audio_file(file_path: str) -> bool:
+    """Return True if the path's suffix, lowercased, is in AUDIO_EXTENSIONS."""
+    return Path(file_path).suffix.lower() in AUDIO_EXTENSIONS
+
+
+class AudioModalProcessor(BaseModalProcessor):
+    """Processor for audio content using faster-whisper for transcription.
+
+    Transcribes audio files into timestamped text segments, then processes
+    them through LightRAG for knowledge graph construction and retrieval.
+
+    Suitable for:
+    - Meeting recordings
+    - Phone call recordings
+    - Podcasts and interviews
+    - Lectures and presentations
+    - Voice memos
+
+    Example:
+        >>> processor = AudioModalProcessor(
+        ...     lightrag=rag_instance,
+        ...     modal_caption_func=caption_func,
+        ...     whisper_model="large-v3",
+        ... )
+        >>> result = await processor.process_multimodal_content(
+        ...     modal_content={"audio_path": "/path/to/meeting.mp3"},
+        ...     content_type="audio",
+        ... )
+    """
+
+    def __init__(
+        self,
+        lightrag,
+        modal_caption_func,
+        context_extractor=None,
+        whisper_model: str = None,
+        whisper_device: str = "auto",
+        whisper_compute_type: str = "auto",
+        language: str = None,
+        segment_min_length: int = 30,
+    ):
+        """Initialize audio processor.
+
+        Args:
+            lightrag: LightRAG instance
+            modal_caption_func: Function for generating descriptions
+            context_extractor: Context extractor instance
+            whisper_model: Whisper model size (tiny/base/small/medium/large-v3)
+                          Falsy: use env WHISPER_MODEL, else "base"
+            whisper_device: Device for inference ("auto", "cpu", "cuda")
+            whisper_compute_type: Compute type ("auto", "float16", "int8")
+            language: Code such as "zh"/"en". Falsy: env WHISPER_LANGUAGE, else auto-detect.
+            segment_min_length: Minimum segment length in characters to keep
+        """
+        super().__init__(lightrag, modal_caption_func, context_extractor)
+        # An empty env value (e.g. "WHISPER_LANGUAGE=") means unset, never a literal "".
+        self.whisper_model_name = (
+            whisper_model or os.environ.get("WHISPER_MODEL") or "base"
+        )
+        self.whisper_device = whisper_device
+        self.whisper_compute_type = whisper_compute_type
+        self.language = language or os.environ.get("WHISPER_LANGUAGE") or None
+        self.segment_min_length = segment_min_length
+        self._whisper_model = None
+
+    @property
+    def whisper(self):
+        """Return the WhisperModel, importing and building it on first access."""
+        if self._whisper_model is None:
+            try:
+                from faster_whisper import WhisperModel
+            except ImportError as exc:
+                # Chain the cause so the real import failure stays in the traceback.
+                raise ImportError(
+                    "faster-whisper is required for audio processing. "
+                    "Install it with: pip install raganything[audio] "
+                    "or: pip install faster-whisper"
+                ) from exc
+            logger.info(
+                f"Loading whisper model: {self.whisper_model_name} "
+                f"(device={self.whisper_device})"
+            )
+            self._whisper_model = WhisperModel(
+                self.whisper_model_name,
+                device=self.whisper_device,
+                compute_type=self.whisper_compute_type,
+            )
+        return self._whisper_model
+
+    def transcribe(self, audio_path: str) -> List[Dict[str, Any]]:
+        """Transcribe audio file to timestamped segments.
+
+        Args:
+            audio_path: Path to the audio file
+
+        Returns:
+            List of segments with start, end (seconds) and text
+        """
+        if not Path(audio_path).exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        logger.info(f"Transcribing audio: {audio_path}")
+
+        segments_iter, info = self.whisper.transcribe(
+            audio_path,
+            language=self.language,
+            vad_filter=True,  # Filter out silence
+            vad_parameters=dict(min_silence_duration_ms=500),
+        )
+
+        segments = []
+        for segment in segments_iter:
+            text = segment.text.strip()
+            if len(text) >= self.segment_min_length:
+                segments.append(
+                    {
+                        "start": segment.start,
+                        "end": segment.end,
+                        "text": text,
+                    }
+                )
+
+        logger.info(
+            f"Transcription complete: {len(segments)} segments, "
+            f"language={info.language}, duration={info.duration:.1f}s"
+        )
+        return segments
+
+    def _format_timestamp(self, seconds: float) -> str:
+        """Format seconds as H:MM:SS, or M:SS when under one hour (fraction dropped)."""
+        h = int(seconds // 3600)
+        m = int((seconds % 3600) // 60)
+        s = int(seconds % 60)
+        if h > 0:
+            return f"{h}:{m:02d}:{s:02d}"
+        return f"{m}:{s:02d}"
+
+    def _segments_to_text(self, segments: List[Dict[str, Any]]) -> str:
+        """Render segments as newline-joined "[start-end] text" lines."""
+        stamp = self._format_timestamp
+        # One output line per segment; an empty list yields an empty string.
+        return "\n".join(
+            f"[{stamp(entry['start'])}-{stamp(entry['end'])}] {entry['text']}"
+            for entry in segments
+        )
+
+    async def generate_description_only(
+        self,
+        modal_content,
+        content_type: str,
+        item_info: Dict[str, Any] = None,
+        entity_name: str = None,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """Generate audio transcription and entity info.
+
+        Args:
+            modal_content: Audio content dict with 'audio_path' key
+            content_type: Type of modal content ("audio")
+            item_info: Item information for context extraction
+            entity_name: Optional predefined entity name
+
+        Returns:
+            Tuple of (transcription_text, entity_info)
+        """
+        try:
+            # Parse audio content
+            if isinstance(modal_content, str):
+                try:
+                    content_data = json.loads(modal_content)
+                except json.JSONDecodeError:
+                    content_data = {"audio_path": modal_content}
+            else:
+                content_data = modal_content
+
+            audio_path = content_data.get("audio_path") or content_data.get("img_path")
+            if not audio_path:
+                raise ValueError(
+                    f"No audio path provided in modal_content: {modal_content}"
+                )
+
+            # Transcribe
+            segments = self.transcribe(audio_path)
+            if not segments:
+                raise RuntimeError(f"No speech detected in audio: {audio_path}")
+
+            # Format transcription
+            transcription = self._segments_to_text(segments)
+
+            # Generate entity info
+            filename = Path(audio_path).stem
+            duration = segments[-1]["end"] if segments else 0
+            entity_info = {
+                "entity_name": entity_name
+                if entity_name
+                else f"audio_{filename}",
+                "entity_type": "audio",
+                "summary": (
+                    f"Audio recording ({self._format_timestamp(duration)} duration). "
+                    f"Transcription: {segments[0]['text'][:100]}..."
+                    if segments
+                    else "Empty audio"
+                ),
+            }
+
+            return transcription, entity_info
+
+        except Exception as e:
+            logger.error(f"Error generating audio transcription: {e}")
+            fallback_entity = {
+                "entity_name": entity_name
+                if entity_name
+                else f"audio_{compute_mdhash_id(str(modal_content))}",
+                "entity_type": "audio",
+                "summary": f"Audio content: {str(modal_content)[:100]}",
+            }
+            return str(modal_content), fallback_entity
+
+    async def process_multimodal_content(
+        self,
+        modal_content,
+        content_type: str,
+        file_path: str = "manual_creation",
+        entity_name: str = None,
+        item_info: Dict[str, Any] = None,
+        batch_mode: bool = False,
+        doc_id: str = None,
+        chunk_order_index: int = 0,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """Process audio content: transcribe and insert into knowledge graph.
+
+        Args:
+            modal_content: Audio content dict with 'audio_path' key
+            content_type: Type of modal content ("audio")
+            file_path: Source file path for attribution
+            entity_name: Optional entity name
+            item_info: Item info for context
+            batch_mode: Whether in batch processing mode
+            doc_id: Document ID
+            chunk_order_index: Chunk ordering index
+
+        Returns:
+            Tuple of (chunk_text, entity_info)
+        """
+        try:
+            # Generate transcription and entity info
+            transcription, entity_info = await self.generate_description_only(
+                modal_content, content_type, item_info, entity_name
+            )
+
+            # Parse audio path for chunk formatting
+            if isinstance(modal_content, str):
+                try:
+                    content_data = json.loads(modal_content)
+                except json.JSONDecodeError:
+                    content_data = {"audio_path": modal_content}
+            else:
+                content_data = modal_content
+
+            audio_path = content_data.get("audio_path") or content_data.get(
+                "img_path", ""
+            )
+
+            # Build audio chunk text
+            modal_chunk = (
+                f"[Audio Content]\n"
+                f"Source: {audio_path}\n"
+                f"Entity: {entity_info['entity_name']}\n"
+                f"Transcription:\n{transcription}"
+            )
+
+            return await self._create_entity_and_chunk(
+                modal_chunk,
+                entity_info,
+                file_path,
+                batch_mode,
+                doc_id,
+                chunk_order_index,
+            )
+
+        except Exception as e:
+            logger.error(f"Error processing audio content: {e}")
+            fallback_entity = {
+                "entity_name": entity_name
+                if entity_name
+                else f"audio_{compute_mdhash_id(str(modal_content))}",
+                "entity_type": "audio",
+                "summary": f"Audio content: {str(modal_content)[:100]}",
+            }
+            return str(modal_content), fallback_entity
diff --git a/raganything/processor.py b/raganything/processor.py
index add0de017..bd727b183 100644
--- a/raganything/processor.py
+++ b/raganything/processor.py
@@ -1172,6 +1172,15 @@ def _apply_chunk_template(
                     enhanced_caption=description,
                 )
 
+            elif content_type == "audio":
+                audio_path = original_item.get("audio_path", original_item.get("img_path", ""))
+
+                return (
+                    f"[Audio Content]\n"
+                    f"Source: {audio_path}\n"
+                    f"Transcription:\n{description}"
+                )
+
             else:  # generic or unknown types
                 content = str(original_item.get("content", original_item))
 
diff --git a/tests/test_audio_processor.py b/tests/test_audio_processor.py
new file mode 100644
index 000000000..35572cd5a
--- /dev/null
+++ b/tests/test_audio_processor.py
@@ -0,0 +1,202 @@
+"""Tests for the AudioModalProcessor."""
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from raganything.modalprocessors_audio import is_audio_file
+
+# AudioModalProcessor requires faster-whisper for full functionality,
+# but we can test it with mocked whisper model
+try:
+    import faster_whisper  # noqa: F401
+
+    HAS_FASTER_WHISPER = True
+except ImportError:
+    HAS_FASTER_WHISPER = False
+
+from raganything.modalprocessors_audio import AudioModalProcessor
+
+
+class TestIsAudioFile:
+    """Test audio file detection."""
+
+    def test_supported_extensions(self):
+        assert is_audio_file("meeting.mp3")
+        assert is_audio_file("recording.wav")
+        assert is_audio_file("podcast.flac")
+        assert is_audio_file("voice.m4a")
+        assert is_audio_file("music.ogg")
+        assert is_audio_file("/path/to/file.WAV")  # case insensitive
+        assert is_audio_file("call.aac")
+        assert is_audio_file("audio.opus")
+
+    def test_unsupported_extensions(self):
+        assert not is_audio_file("document.pdf")
+        assert not is_audio_file("image.png")
+        assert not is_audio_file("video.mp4")
+        assert not is_audio_file("text.txt")
+        assert not is_audio_file("no_extension")
+
+
+class TestAudioModalProcessorInit:
+    """Test AudioModalProcessor initialization."""
+
+    def test_default_init(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        caption_func = AsyncMock()
+
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=caption_func,
+        )
+        assert processor.whisper_model_name == "base"
+        assert processor.whisper_device == "auto"
+        assert processor.language is None
+
+    def test_custom_model(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        caption_func = AsyncMock()
+
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=caption_func,
+            whisper_model="large-v3",
+            language="zh",
+        )
+        assert processor.whisper_model_name == "large-v3"
+        assert processor.language == "zh"
+
+    def test_env_override(self, monkeypatch):
+        monkeypatch.setenv("WHISPER_MODEL", "medium")
+        monkeypatch.setenv("WHISPER_LANGUAGE", "en")
+
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        caption_func = AsyncMock()
+
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=caption_func,
+        )
+        assert processor.whisper_model_name == "medium"
+        assert processor.language == "en"
+
+
+class TestFormatTimestamp:
+    """Test timestamp formatting."""
+
+    def setup_method(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        self.processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+    def test_seconds_only(self):
+        assert self.processor._format_timestamp(45) == "0:45"
+
+    def test_minutes_and_seconds(self):
+        assert self.processor._format_timestamp(125) == "2:05"
+
+    def test_hours(self):
+        assert self.processor._format_timestamp(3661) == "1:01:01"
+
+    def test_zero(self):
+        assert self.processor._format_timestamp(0) == "0:00"
+
+
+class TestSegmentsToText:
+    """Test segment formatting."""
+
+    def setup_method(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        self.processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+    def test_single_segment(self):
+        segments = [{"start": 0, "end": 30, "text": "Hello world"}]
+        result = self.processor._segments_to_text(segments)
+        assert result == "[0:00-0:30] Hello world"
+
+    def test_multiple_segments(self):
+        segments = [
+            {"start": 0, "end": 30, "text": "First segment"},
+            {"start": 30, "end": 65, "text": "Second segment"},
+        ]
+        result = self.processor._segments_to_text(segments)
+        assert "[0:00-0:30] First segment" in result
+        assert "[0:30-1:05] Second segment" in result
+
+    def test_empty_segments(self):
+        result = self.processor._segments_to_text([])
+        assert result == ""
+
+
+@pytest.mark.asyncio
+class TestGenerateDescriptionOnly:
+    """Test the generate_description_only method."""
+
+    async def test_file_not_found(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+        # Should return fallback on missing file
+        result, entity_info = await processor.generate_description_only(
+            {"audio_path": "/nonexistent/file.mp3"},
+            "audio",
+        )
+        assert entity_info["entity_type"] == "audio"
+
+    async def test_with_dict_content(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+        # Mock transcribe to avoid needing actual audio
+        mock_segments = [
+            {"start": 0, "end": 10, "text": "This is a test transcription"},
+            {"start": 10, "end": 20, "text": "Second part of the audio"},
+        ]
+        processor.transcribe = MagicMock(return_value=mock_segments)
+
+        result, entity_info = await processor.generate_description_only(
+            {"audio_path": "/tmp/test.mp3"},
+            "audio",
+        )
+
+        assert "[0:00-0:10] This is a test transcription" in result
+        assert "[0:10-0:20] Second part of the audio" in result
+        assert entity_info["entity_type"] == "audio"
+        assert "audio_test" in entity_info["entity_name"]
+
+    async def test_with_string_content(self):
+        lightrag = MagicMock()
+        lightrag.tokenizer = None
+        processor = AudioModalProcessor(
+            lightrag=lightrag,
+            modal_caption_func=AsyncMock(),
+        )
+
+        mock_segments = [{"start": 0, "end": 5, "text": "Hello"}]
+        processor.transcribe = MagicMock(return_value=mock_segments)
+
+        # Pass path as string directly
+        result, entity_info = await processor.generate_description_only(
+            "/tmp/test.mp3",
+            "audio",
+        )
+        assert "[0:00-0:05] Hello" in result