"""
Voice Agent Orchestrator
Handles the complete voice interaction pipeline with optimized latency
"""

import asyncio
import json
import random
import re
import time
from typing import Any, AsyncGenerator, Dict, List
from dataclasses import dataclass
from enum import Enum

class ConversationState(Enum):
    LISTENING = "listening"
    PROCESSING = "processing"
    SPEAKING = "speaking"
    WAITING = "waiting"

@dataclass
class AgentConfig:
    """Configuration for voice agent"""
    language: str = "swahili"
    stt_model: str = "whisper-medium"
    tts_model: str = "chatterbox-multilingual"
    llm_model: str = "claude-sonnet-4"
    use_fillers: bool = True
    filler_threshold_ms: int = 800  # Play filler if processing >800ms
    max_latency_ms: int = 1500
    vad_threshold: float = 0.5
    interrupt_enabled: bool = True

class VoiceAgentOrchestrator:
    """
    Main orchestrator for voice agent with optimized latency management
    """
    
    def __init__(self, config: AgentConfig):
        self.config = config
        self.state = ConversationState.LISTENING
        self.conversation_history: List[Dict[str, Any]] = []
        self.context: Dict[str, Any] = {}
        
        # Filler phrases for natural conversation flow
        self.fillers = {
            "swahili": {
                "thinking": [
                    "Subiri kidogo...",
                    "Ngoja niangalie...",
                    "Ninachunguza taarifa zako...",
                ],
                "fetching": [
                    "Ninapata taarifa...",
                    "Naangalia rekodi zako...",
                    "Ninakagua akaunti yako...",
                ],
                "processing": [
                    "Sawa, ninafanya hesabu...",
                    "Nachambua hii...",
                    "Ninahakikisha...",
                ],
                "confirming": [
                    "Ndio, nimekuelewa...",
                    "Sawa, naendelea...",
                ]
            },
            "english": {
                "thinking": [
                    "One moment please...",
                    "Let me check that...",
                    "I'm looking that up...",
                ],
                "fetching": [
                    "Retrieving your information...",
                    "Checking your account...",
                    "Getting your details...",
                ],
                "processing": [
                    "Okay, calculating...",
                    "Processing that...",
                    "Let me verify...",
                ],
                "confirming": [
                    "Yes, I understand...",
                    "Okay, continuing...",
                ]
            }
        }
        
    async def process_audio_stream(
        self, 
        audio_stream: AsyncGenerator[bytes, None],
        user_context: Dict[str, Any]
    ) -> AsyncGenerator[bytes, None]:
        """
        Main processing pipeline with streaming and parallelization
        
        Args:
            audio_stream: Incoming audio chunks
            user_context: User profile, loan info, etc.
            
        Yields:
            Audio response chunks
        """
        self.context = user_context
        
        # VAD and audio buffer
        vad_buffer = []
        speech_detected = False
        
        async for audio_chunk in audio_stream:
            # Step 1: Voice Activity Detection (runs while the chunk is buffered)
            vad_task = asyncio.create_task(
                self._detect_voice_activity(audio_chunk)
            )
            
            # Buffer audio while checking VAD
            vad_buffer.append(audio_chunk)
            
            is_speech = await vad_task
            
            if is_speech:
                speech_detected = True
            elif speech_detected:
                # First silent chunk after speech: end of utterance
                speech_detected = False
                
                # Process the complete utterance
                async for response_chunk in self._process_utterance(vad_buffer):
                    yield response_chunk
                
                # Clear buffer for the next utterance
                vad_buffer = []
            else:
                # Leading silence: keep only a short pre-roll so the buffer
                # doesn't grow unbounded before speech starts
                vad_buffer = vad_buffer[-5:]
    
    async def _process_utterance(
        self, 
        audio_buffer: List[bytes]
    ) -> AsyncGenerator[bytes, None]:
        """
        Process complete user utterance with optimized latency
        """
        start_time = time.time()
        
        # Combine audio buffer
        audio_data = b''.join(audio_buffer)
        
        # Parallel processing: Start all tasks simultaneously
        tasks = {
            'stt': asyncio.create_task(self._speech_to_text(audio_data)),
            'filler_prep': asyncio.create_task(self._prepare_filler("thinking")),
            'context_prep': asyncio.create_task(self._prepare_context())
        }
        
        # Wait for STT (fastest, ~300ms)
        user_text = await tasks['stt']
        stt_latency = time.time() - start_time
        
        # Add to conversation history
        self.conversation_history.append({
            "role": "user",
            "content": user_text
        })
        
        # Determine if we need a filler
        intent_type = self._classify_intent(user_text)
        estimated_processing_time = self._estimate_processing_time(intent_type)
        
        filler_task = None
        if (self.config.use_fillers
                and estimated_processing_time > self.config.filler_threshold_ms):
            # Play the pre-generated filler while the LLM works
            filler_audio = await tasks['filler_prep']
            filler_task = asyncio.create_task(self._stream_audio(filler_audio))
        else:
            # Filler not needed; don't leave the task dangling
            tasks['filler_prep'].cancel()
        
        # Context prep ran in parallel with STT; await it so the task
        # doesn't dangle (this simplified version builds the prompt from
        # self.context directly)
        await tasks['context_prep']
        
        # Process with LLM (may include tool calls)
        llm_start = time.time()
        response_text, tool_calls = await self._process_with_llm(
            user_text, 
            intent_type
        )
        llm_latency = time.time() - llm_start
        
        # Wait for the filler to finish before speaking the real answer
        if filler_task:
            await filler_task
        # Stream TTS response
        tts_start = time.time()
        async for audio_chunk in self._text_to_speech_stream(response_text):
            yield audio_chunk
        
        tts_latency = time.time() - tts_start
        
        # Log latency metrics
        total_latency = time.time() - start_time
        self._log_latency({
            'stt': stt_latency * 1000,
            'llm': llm_latency * 1000,
            'tts': tts_latency * 1000,
            'total': total_latency * 1000
        })
        
        # Add to conversation history
        self.conversation_history.append({
            "role": "assistant",
            "content": response_text,
            "tool_calls": tool_calls
        })
    
    async def _detect_voice_activity(self, audio_chunk: bytes) -> bool:
        """
        Voice Activity Detection placeholder
        In production, use a real model such as Silero VAD (~5-10ms)
        """
        await asyncio.sleep(0.01)  # Simulate VAD processing
        return len(audio_chunk) > 0  # Treat empty chunks as silence
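
    # A sketch of swapping in Silero VAD, assuming 16 kHz 16-bit mono PCM and
    # the torch.hub distribution of snakers4/silero-vad (torch is an assumed
    # optional dependency; recent Silero releases expect fixed 512-sample
    # chunks at 16 kHz). Not wired into the pipeline above.
    async def _detect_voice_activity_silero(self, audio_chunk: bytes) -> bool:
        import torch  # heavyweight import, kept local to this sketch
        if not hasattr(self, "_vad_model"):
            self._vad_model, _ = torch.hub.load(
                "snakers4/silero-vad", "silero_vad"
            )
        # bytearray gives torch a writable buffer to wrap
        samples = torch.frombuffer(
            bytearray(audio_chunk), dtype=torch.int16
        ).float() / 32768.0
        speech_prob = self._vad_model(samples[:512], 16000).item()
        return speech_prob > self.config.vad_threshold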
    
    async def _speech_to_text(self, audio_data: bytes) -> str:
        """
        Convert speech to text using Whisper
        Target latency: 200-400ms for medium model
        """
        # In production, use an actual Whisper model
        # (whisper.cpp or faster-whisper for lower-latency inference)
        await asyncio.sleep(0.3)  # Simulate STT processing
        return "Nataka kujua kiasi cha mkopo wangu"  # "I want to know my loan balance"
    
    async def _process_with_llm(
        self, 
        user_text: str, 
        intent_type: str
    ) -> tuple[str, List[Dict]]:
        """
        Process user input with LLM and execute tool calls
        Target latency: 500-1000ms
        """
        # Prepare system prompt with user context
        system_prompt = self._build_system_prompt()
        
        # Check if we need to call tools
        tool_calls = []
        if intent_type in ["loan_balance", "payment_status", "loan_history"]:
            # Execute tool calls in parallel
            tool_results = await self._execute_tools(intent_type)
            tool_calls = tool_results
        
        # Get LLM response (streaming possible)
        # Using Claude API with streaming
        response = await self._call_llm_api(
            system_prompt=system_prompt,
            user_message=user_text,
            tool_results=tool_calls
        )
        
        return response, tool_calls
    
    async def _execute_tools(self, intent_type: str) -> List[Dict]:
        """
        Execute relevant API calls based on intent
        Parallel execution for multiple calls
        """
        tasks = []
        
        if intent_type == "loan_balance":
            tasks.append(
                asyncio.create_task(
                    self._call_api("/api/v1/loans/active", 
                                   {"phone": self.context.get("phone")})
                )
            )
        elif intent_type == "payment_status":
            tasks.extend([
                asyncio.create_task(
                    self._call_api("/api/v1/payments/recent",
                                   {"phone": self.context.get("phone")})
                ),
                asyncio.create_task(
                    self._call_api("/api/v1/loans/active",
                                   {"phone": self.context.get("phone")})
                )
            ])
        
        # Execute all API calls in parallel
        results = await asyncio.gather(*tasks)
        return results
    
    async def _text_to_speech_stream(
        self, 
        text: str
    ) -> AsyncGenerator[bytes, None]:
        """
        Convert text to speech with streaming
        Target latency: 200ms for first chunk
        """
        # In production, use Chatterbox or similar
        # Stream audio chunks as they're generated
        
        # Simulate streaming TTS
        chunks = self._split_for_streaming(text)
        for chunk in chunks:
            await asyncio.sleep(0.05)  # Simulate chunk generation
            yield chunk.encode()  # Placeholder audio data
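
    # A sketch of how the streaming contract could look against an HTTP TTS
    # service, using aiohttp (an assumed optional dependency). The endpoint
    # URL and payload fields are hypothetical placeholders, not Chatterbox's
    # actual API.
    async def _text_to_speech_stream_http(
        self,
        text: str
    ) -> AsyncGenerator[bytes, None]:
        import aiohttp
        async with aiohttp.ClientSession() as session:
            async with session.post(
                "http://localhost:8880/tts/stream",  # hypothetical endpoint
                json={"text": text, "language": self.config.language},
            ) as resp:
                resp.raise_for_status()
                # Relay audio bytes as the server produces them
                async for chunk in resp.content.iter_chunked(4096):
                    yield chunk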
    
    def _classify_intent(self, text: str) -> str:
        """
        Quick intent classification for latency optimization
        """
        text_lower = text.lower()
        
        if any(word in text_lower for word in ['kiasi', 'balance', 'mkopo']):
            return "loan_balance"
        elif any(word in text_lower for word in ['malipo', 'payment', 'kulipa']):
            return "payment_status"
        elif any(word in text_lower for word in ['historia', 'history', 'zamani']):
            return "loan_history"
        else:
            return "general"
    
    def _estimate_processing_time(self, intent_type: str) -> int:
        """
        Estimate processing time based on intent
        Returns: milliseconds
        """
        estimates = {
            "loan_balance": 1200,      # Needs API call
            "payment_status": 1500,    # Needs multiple API calls
            "loan_history": 2000,      # Complex query
            "general": 600             # Simple response
        }
        return estimates.get(intent_type, 800)
    
    async def _prepare_filler(self, filler_type: str) -> bytes:
        """
        Pre-generate filler audio for instant playback
        """
        filler_text = random.choice(
            self.fillers[self.config.language][filler_type]
        )
        
        # In production, serve pre-cached TTS audio for these phrases
        # instead of synthesizing on demand
        await asyncio.sleep(0.1)  # Simulate TTS
        return filler_text.encode()  # Placeholder audio data
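
    # Since the filler set is small and fixed, its audio can be synthesized
    # once at startup. A minimal sketch of that warm-up; the _filler_cache
    # attribute is illustrative, not part of the original design.
    async def warm_filler_cache(self) -> None:
        """Pre-synthesize every filler phrase so playback is instant."""
        self._filler_cache: Dict[str, bytes] = {}
        for phrases in self.fillers[self.config.language].values():
            for phrase in phrases:
                # Placeholder synthesis; swap in the real TTS call here
                await asyncio.sleep(0.1)
                self._filler_cache[phrase] = phrase.encode()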
    
    async def _prepare_context(self) -> Dict:
        """
        Prepare relevant context for faster LLM processing
        """
        return {
            "user_profile": self.context,
            "recent_history": self.conversation_history[-5:]
        }
    
    def _build_system_prompt(self) -> str:
        """
        Build system prompt with user context
        """
        return f"""You are a helpful loan assistant for Credable, a fintech company in East Africa.

User Context:
- Phone: {self.context.get('phone')}
- Name: {self.context.get('name')}
- Language: {self.config.language}

You can:
1. Check loan balances
2. View payment history
3. Provide loan information
4. Schedule payments

Be concise and helpful. Speak naturally in {self.config.language}.
"""
    
    async def _call_llm_api(
        self, 
        system_prompt: str, 
        user_message: str,
        tool_results: List[Dict]
    ) -> str:
        """
        Call LLM API (Claude, GPT, etc.)
        """
        # Placeholder for actual API call
        await asyncio.sleep(0.5)
        
        if tool_results:
            # "Your loan balance is TZS 500,000. You owe TZS 150,000."
            return "Kiasi chako cha mkopo ni TZS 500,000. Una deni la TZS 150,000."
        # "Welcome! How can I help you today?"
        return "Karibu! Ninaweza kukusaidiaje leo?"
    
    async def _call_api(self, endpoint: str, params: Dict) -> Dict:
        """
        Call mock/real API
        """
        # Placeholder for actual API calls
        await asyncio.sleep(0.2)
        return {"status": "success", "data": {}}
    
    async def _stream_audio(self, audio_data: bytes):
        """
        Play audio out-of-band (e.g., push filler audio to the telephony
        channel). Placeholder: simulates playback time only.
        """
        await asyncio.sleep(0.1)  # Simulate streaming
    
    def _split_for_streaming(self, text: str) -> List[str]:
        """
        Split text into chunks for streaming TTS
        """
        # Split on sentence boundaries
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]
    
    def _log_latency(self, metrics: Dict[str, float]):
        """
        Log latency metrics for monitoring
        """
        print(f"Latency Metrics:")
        print(f"  STT: {metrics['stt']:.0f}ms")
        print(f"  LLM: {metrics['llm']:.0f}ms")
        print(f"  TTS: {metrics['tts']:.0f}ms")
        print(f"  Total: {metrics['total']:.0f}ms")
        
        if metrics['total'] > self.config.max_latency_ms:
            print(f"⚠️ Latency exceeded target: {metrics['total']:.0f}ms > {self.config.max_latency_ms}ms")


# Example usage
async def main():
    config = AgentConfig(
        language="swahili",
        use_fillers=True,
        filler_threshold_ms=800
    )
    
    orchestrator = VoiceAgentOrchestrator(config)
    
    # Simulate user context
    user_context = {
        "phone": "+255123456789",
        "name": "John Doe",
        "customer_id": "CUST001"
    }
    
    print("Voice Agent Ready!")
    print(f"Language: {config.language}")
    print(f"Max Latency Target: {config.max_latency_ms}ms")
    print("-" * 50)


if __name__ == "__main__":
    asyncio.run(main())
