Skip to main content

Python Integration

Connect to Perf Voice Agents from Python for server-side applications, IVR systems, telephony integrations, or testing.

Prerequisites

pip install websocket-client pyaudio
  • websocket-client — WebSocket client library
  • PyAudio — Cross-platform audio I/O (requires PortAudio system library)

Installing PortAudio

PyAudio requires the PortAudio system library:
# macOS
brew install portaudio

# Ubuntu/Debian
sudo apt-get install portaudio19-dev

# Windows (via pip)
pip install pyaudio  # includes prebuilt binaries

Quick Start

import websocket
import json
import base64
import threading
import pyaudio

WS_URL = 'wss://api.withperf.pro/v1/voice/conversation'
API_KEY = 'YOUR_API_KEY'
AGENT_ID = 'YOUR_AGENT_ID'

RATE = 16000    # sample rate in Hz (PCM16 mono expected by the agent)
CHUNK = 2048    # samples per chunk read from / written to the audio device
ready = False   # flips True once the server sends conversation_initiation_metadata

# Audio setup: one microphone (input) and one speaker (output) stream,
# both mono 16-bit PCM at RATE.
pa = pyaudio.PyAudio()
_stream_opts = dict(format=pyaudio.paInt16, channels=1, rate=RATE,
                    frames_per_buffer=CHUNK)
mic_stream = pa.open(input=True, **_stream_opts)
spk_stream = pa.open(output=True, **_stream_opts)

def on_message(ws, message):
    """Dispatch one server event: session init, audio playback, transcripts, keepalive."""
    global ready
    event = json.loads(message)
    kind = event.get('type', '')

    if kind == 'conversation_initiation_metadata':
        # First message of the session — safe to start streaming mic audio now.
        ready = True
        meta = event.get('conversation_initiation_metadata_event', {})
        print(f"Session started: {meta.get('conversation_id')}")
        return

    if kind == 'audio':
        # Agent speech arrives as base64 PCM16; write it straight to the speaker.
        chunk_b64 = event.get('audio_event', {}).get('audio_base_64', '')
        if chunk_b64:
            spk_stream.write(base64.b64decode(chunk_b64))
        return

    if kind == 'agent_response':
        reply = event.get('agent_response_event', {}).get('agent_response', '')
        if reply:
            print(f'Agent: {reply}')
        return

    if kind == 'user_transcript':
        heard = event.get('user_transcription_event', {}).get('user_transcript', '')
        if heard:
            print(f'You: {heard}')
        return

    if kind == 'ping':
        # Keepalive: echo the event_id back as a pong.
        pong = {'type': 'pong', 'event_id': event.get('ping_event', {}).get('event_id')}
        ws.send(json.dumps(pong))

def send_audio(ws):
    """Stream microphone audio to the server as base64 PCM16 chunks.

    Runs on a daemon thread started by on_open. Waits for the server's
    init message (global `ready`) before sending, then blocks on mic reads
    so the loop naturally paces itself to real time.
    """
    import time  # local import keeps this snippet self-contained

    while ws.sock and ws.sock.connected:
        if not ready:
            # Sleep briefly instead of spinning: the original bare `continue`
            # busy-waited at 100% CPU until the init message arrived.
            time.sleep(0.01)
            continue
        pcm_data = mic_stream.read(CHUNK, exception_on_overflow=False)
        b64 = base64.b64encode(pcm_data).decode('utf-8')
        ws.send(json.dumps({'user_audio_chunk': b64}))

def on_open(ws):
    """Kick off the microphone streamer on a background daemon thread."""
    sender = threading.Thread(target=send_audio, args=(ws,), daemon=True)
    sender.start()
    print('Connected — speak now')

def on_close(ws, code, reason):
    """Log the WebSocket close code and reason."""
    print('Disconnected (code={}, reason={})'.format(code, reason))

def on_error(ws, error):
    """Log any WebSocket-level error."""
    print('Error: {}'.format(error))

# Build the authenticated URL and run the client until the connection closes.
url = f'{WS_URL}?api_key={API_KEY}&agent_id={AGENT_ID}'
ws = websocket.WebSocketApp(
    url,
    on_open=on_open,
    on_message=on_message,
    on_error=on_error,
    on_close=on_close,
)
ws.run_forever()

How It Works

  1. Connect — Opens a WebSocket to wss://api.withperf.pro/v1/voice/conversation with your API key and agent ID
  2. Wait for init — The conversation_initiation_metadata message signals the pipeline is ready
  3. Stream audio — A background thread reads microphone PCM16 chunks, base64-encodes them, and sends via WebSocket
  4. Play responses — Agent audio arrives as base64 PCM16 and is written directly to the speaker stream
  5. Keepalive — Responds to ping messages with pong to keep the connection alive

Audio Format

| Property    | Value                                        |
| ----------- | -------------------------------------------- |
| Encoding    | PCM signed 16-bit integer (pyaudio.paInt16)  |
| Sample rate | 16,000 Hz                                    |
| Channels    | 1 (mono)                                     |
| Chunk size  | 2048 samples                                 |
| Transport   | Base64-encoded JSON messages                 |

Sending Pre-Recorded Audio

To send a WAV file instead of live microphone input:
import time
import wave

def send_wav_file(ws, filepath):
    """Send a pre-recorded WAV file to the agent as paced audio chunks.

    The file must be PCM signed 16-bit, mono, 16 kHz — the format the agent
    expects. Chunks are base64-encoded, wrapped in JSON, and throttled to
    real time so the server receives audio at playback speed.
    """
    with wave.open(filepath, 'rb') as wf:
        assert wf.getsampwidth() == 2, 'Must be 16-bit PCM'
        assert wf.getnchannels() == 1, 'Must be mono'
        assert wf.getframerate() == 16000, 'Must be 16kHz'

        while True:
            frames = wf.readframes(2048)
            if not frames:
                break
            b64 = base64.b64encode(frames).decode('utf-8')
            ws.send(json.dumps({'user_audio_chunk': b64}))
            # Pace to real-time (2048 samples at 16kHz = 128ms).
            # NOTE: the original snippet used time.sleep without importing
            # time, which raised NameError — hence the import above.
            time.sleep(0.128)

Saving Transcripts

transcripts = []

def on_message(ws, message):
    """Accumulate agent and user transcript lines into `transcripts`."""
    global ready
    event = json.loads(message)
    kind = event.get('type', '')

    if kind == 'agent_response':
        line = event.get('agent_response_event', {}).get('agent_response', '')
        if line:
            transcripts.append({'role': 'agent', 'text': line})
    elif kind == 'user_transcript':
        line = event.get('user_transcription_event', {}).get('user_transcript', '')
        if line:
            transcripts.append({'role': 'user', 'text': line})

    # ... handle other message types

# After conversation ends:
for entry in transcripts:
    print(f"{entry['role'].capitalize()}: {entry['text']}")

Connection Test

Quick test without audio — verify authentication and agent connectivity:
import websocket
import json

url = ('wss://api.withperf.pro/v1/voice/conversation'
       '?api_key=YOUR_API_KEY&agent_id=YOUR_AGENT_ID')

# The first server message should be the session-init event; anything else
# means auth or agent lookup failed.
ws = websocket.create_connection(url)
data = json.loads(ws.recv())

if data.get('type') != 'conversation_initiation_metadata':
    print(f'Unexpected response: {data}')
else:
    conv_id = data['conversation_initiation_metadata_event']['conversation_id']
    print(f'Connected successfully. Session: {conv_id}')

ws.close()

Error Handling

| Error                     | Cause                                   | Resolution                                        |
| ------------------------- | --------------------------------------- | ------------------------------------------------- |
| ConnectionRefused         | Invalid URL or server down              | Verify the WebSocket URL                          |
| WebSocket close code 4001 | Invalid API key                         | Check your api_key parameter                      |
| WebSocket close code 4004 | Invalid agent ID                        | Verify the agent_id exists in your project        |
| WebSocket close code 1008 | Sent audio before init                  | Wait for conversation_initiation_metadata         |
| OSError: [Errno -9999]    | No audio device available               | Check microphone is connected and accessible      |
| Audio crackling/gaps      | Chunk size too small or CPU overloaded  | Increase CHUNK size or reduce processing          |