feat: transcription fix + SLA write-back + real-time supervisor events

- Deepgram: multichannel=true + language=multi (captures both speakers, multilingual)
- LLM speaker identification (agent vs customer from conversational cues)
- Removed summarize=v2 (incompatible with multilingual)
- SLA computation on call creation (lead.createdAt → call.startedAt elapsed %)
- WebSocket: supervisor room + call:created broadcast for real-time updates
- Maint: clear-analysis-cache endpoint + scanKeys/deleteCache on SessionService
- AI chat: rules-engine context routing with dedicated system prompt

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-18 20:08:19 +00:00 · 2026-04-01 16:59:23 +05:30
parent b8556cf440
commit 5e3ccbd040
8 changed files with 461 additions and 33 deletions
--- a/src/recordings/recordings.service.ts
+++ b/src/recordings/recordings.service.ts
@@ -57,10 +57,10 @@ export class RecordingsService {
        // Step 1: Send to Deepgram pre-recorded API with diarization + sentiment
        const dgResponse = await fetch(DEEPGRAM_API + '?' + new URLSearchParams({
            model: 'nova-2',
-            language: 'en',
+            language: 'multi',
            smart_format: 'true',
            diarize: 'true',
-            summarize: 'v2',
+            multichannel: 'true',
            topics: 'true',
            sentiment: 'true',
            utterances: 'true',
@@ -82,9 +82,9 @@ export class RecordingsService {
        const dgData = await dgResponse.json();
        const results = dgData.results;

-        // Extract utterances (speaker-labeled segments)
+        // Extract utterances (channel-labeled for multichannel, speaker-labeled otherwise)
        const utterances: TranscriptUtterance[] = (results?.utterances ?? []).map((u: any) => ({
-            speaker: u.speaker ?? 0,
+            speaker: u.channel ?? u.speaker ?? 0,
            start: u.start ?? 0,
            end: u.end ?? 0,
            text: u.transcript ?? '',
@@ -106,14 +106,27 @@ export class RecordingsService {
            ? results.channels[0].alternatives[0].words.slice(-1)[0].end
            : 0;

-        // Step 2: Full transcript text for AI analysis
-        const fullTranscript = utterances.map(u =>
-            `Speaker ${u.speaker === 0 ? 'Agent' : 'Customer'}: ${u.text}`,
+        // Step 2: Build raw transcript with channel labels for AI to identify roles
+        const rawTranscript = utterances.map(u =>
+            `Channel ${u.speaker}: ${u.text}`,
        ).join('\n');

        this.logger.log(`[RECORDING] Transcribed: ${utterances.length} utterances, ${Math.round(duration)}s`);

-        // Step 3: AI insights
+        // Step 3: Ask AI to identify agent vs customer, then generate insights
+        const speakerMap = await this.identifySpeakers(rawTranscript);
+        const fullTranscript = utterances.map(u =>
+            `${speakerMap[u.speaker] ?? `Speaker ${u.speaker}`}: ${u.text}`,
+        ).join('\n');
+
+        // Remap utterance speaker labels for the frontend
+        for (const u of utterances) {
+            // 0 = agent, 1 = customer in the returned data
+            const role = speakerMap[u.speaker];
+            if (role === 'Agent') u.speaker = 0;
+            else if (role === 'Customer') u.speaker = 1;
+        }
+
        const insights = await this.generateInsights(fullTranscript, summary, topics);

        return {
@@ -126,6 +139,45 @@ export class RecordingsService {
        };
    }

+    /** Asks the LLM which channel (0 or 1) is the agent and returns a channel → role map.
+     *  Falls back to { 0: 'Customer', 1: 'Agent' } when no model is configured, the transcript
+     *  is blank, the model names a channel other than 0 or 1, or the request fails. */
+    private async identifySpeakers(rawTranscript: string): Promise<Record<number, string>> {
+        if (!this.aiModel || !rawTranscript.trim()) return { 0: 'Customer', 1: 'Agent' };
+        try {
+            const { object } = await generateObject({
+                model: this.aiModel,
+                schema: z.object({
+                    agentChannel: z.number().describe('The channel number (0 or 1) that is the call center agent'),
+                    reasoning: z.string().describe('Brief explanation of how you identified the agent'),
+                }),
+                system: `You are analyzing a hospital call center recording transcript.
+Each line is labeled with a channel number. One channel is the call center agent, the other is the customer/patient.
+
+The AGENT typically:
+- Greets professionally ("Hello, Global Hospital", "How can I help you?")
+- Asks for patient details (name, phone, department)
+- Provides information about doctors, schedules, services
+- Navigates systems, puts on hold, transfers calls
+
+The CUSTOMER typically:
+- Asks questions about appointments, doctors, services
+- Provides personal details when asked
+- Describes symptoms or reasons for calling`,
+                prompt: rawTranscript,
+                maxOutputTokens: 100,
+            });
+            const agentCh = object.agentChannel;
+            if (agentCh !== 0 && agentCh !== 1) throw new Error(`agentChannel must be 0 or 1, got ${agentCh}`);
+            const customerCh = agentCh === 0 ? 1 : 0;
+            this.logger.log(`[RECORDING] Speaker ID: agent=Ch${agentCh}, customer=Ch${customerCh} (${object.reasoning})`);
+            return { [agentCh]: 'Agent', [customerCh]: 'Customer' };
+        } catch (err) {
+            this.logger.warn(`[RECORDING] Speaker identification failed: ${err}`);
+            return { 0: 'Customer', 1: 'Agent' };
+        }
+    }
+
    private computeAverageSentiment(segments: any[]): { label: 'positive' | 'neutral' | 'negative' | 'mixed'; score: number } {
        if (!segments?.length) return { label: 'neutral', score: 0 };