#!/usr/bin/env python3
"""
Copilot SDK System Message Test Script
Tests whether system_message is properly applied during session.resume

This script verifies the bug hypothesis:
- session.resume with system_message config may not reliably update the system prompt

Test scenarios:
1. Create a new session with a custom system message
2. Resume the same session with a DIFFERENT system message
3. Resume the same session WITHOUT a system message and ask for its identity

Requirements:
- github-copilot-sdk>=0.1.23

Environment:
- COPILOT_CLI_PATH (optional): passed to the client as ``cli_path`` when set.
"""

import asyncio
import os
import sys
import time

from copilot import CopilotClient
from copilot.types import SessionConfig
from copilot.generated.session_events import SessionEventType

# Test system messages. Each one gives the model a distinct, easily detectable
# identity marker ("ALPHA" / "BETA") that the tests search for in the reply.
SYSTEM_MSG_A = """You are a helpful assistant named "ALPHA".
When asked about your name or identity, you MUST respond: "I am ALPHA, the first assistant."
Always start your responses with "[ALPHA]:" prefix.
"""

SYSTEM_MSG_B = """You are a helpful assistant named "BETA".
When asked about your name or identity, you MUST respond: "I am BETA, the second assistant."
Always start your responses with "[BETA]:" prefix.
"""


async def send_and_get_response(session, prompt: str, timeout: float = 120) -> str:
    """Send a message and collect the full response using event subscription.

    Streamed deltas are echoed to stdout as they arrive and concatenated into
    the return value. If no delta text was received, the content of the final
    assistant message is used (and printed) instead.

    Args:
        session: Session object exposing ``on(handler)`` (returning an
            unsubscribe callable) and an awaitable ``send(payload)``.
        prompt: User prompt to send.
        timeout: Maximum number of seconds to wait for the turn to finish.

    Returns:
        The assistant's response text (possibly empty).

    Raises:
        asyncio.TimeoutError: If neither SESSION_IDLE nor ASSISTANT_TURN_END
            is observed within ``timeout`` seconds.
    """
    chunks = []
    response_complete = asyncio.Event()

    def event_handler(event):
        if event.type == SessionEventType.ASSISTANT_MESSAGE_DELTA:
            delta = getattr(event.data, "content", "") or ""
            print(delta, end="", flush=True)
            chunks.append(delta)
        elif event.type == SessionEventType.ASSISTANT_MESSAGE:
            # Final complete message: only used when streaming produced no
            # text, so the response is never duplicated.
            content = getattr(event.data, "content", "") or ""
            if content and not any(chunks):
                chunks.append(content)
                print(content, end="", flush=True)
        elif event.type in (
            SessionEventType.SESSION_IDLE,
            SessionEventType.ASSISTANT_TURN_END,
        ):
            # Either event marks the end of the turn.
            response_complete.set()

    # Subscribe before sending so no early event is missed.
    unsubscribe = session.on(event_handler)
    try:
        await session.send({"prompt": prompt, "mode": "immediate"})
        # Wait for completion (with timeout)
        await asyncio.wait_for(response_complete.wait(), timeout=timeout)
        print()  # newline after completion
    finally:
        # Always detach the handler, even on timeout or send failure.
        unsubscribe()

    return "".join(chunks)


async def test_new_session_system_message(client: CopilotClient):
    """Test 1: New session with system message A.

    Creates session ``test-session-001`` with SYSTEM_MSG_A in ``replace`` mode,
    asks the model for its name, and reports whether "ALPHA" appears in the
    reply.

    Returns:
        The created session, so callers can resume it by ``session_id``.
    """
    print("\n" + "=" * 60)
    print("TEST 1: New Session with System Message A (ALPHA)")
    print("=" * 60)

    session_config = SessionConfig(
        session_id="test-session-001",
        model="gpt-5-mini",
        streaming=True,
        system_message={
            "mode": "replace",
            "content": SYSTEM_MSG_A,
        },
    )

    session = await client.create_session(config=session_config)
    print(f"✅ Created new session: {session.session_id}")

    print("\n📤 Asking: 'What is your name?'")
    print("📥 Response: ", end="")
    response = await send_and_get_response(session, "What is your name?")

    if "ALPHA" in response:
        print("✅ SUCCESS: Model correctly identified as ALPHA")
    else:
        print("⚠️ WARNING: Model did NOT identify as ALPHA")

    return session


async def test_resume_session_with_new_system_message(
    client: CopilotClient, session_id: str
):
    """Test 2: Resume session with DIFFERENT system message B.

    Resumes ``session_id`` with SYSTEM_MSG_B in ``replace`` mode and asks the
    model about its identity.

    Returns:
        True if the reply contains "BETA" (system message was updated),
        False if it contains "ALPHA" but not "BETA" (old message persisted),
        None if neither marker appears.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Resume Session with System Message B (BETA)")
    print("=" * 60)

    resume_config = {
        "model": "gpt-5-mini",
        "streaming": True,
        "system_message": {
            "mode": "replace",
            "content": SYSTEM_MSG_B,
        },
    }

    print("📋 Resume config includes system_message with mode='replace'")
    print("📋 New system_message content: BETA identity")

    session = await client.resume_session(session_id, resume_config)
    print(f"✅ Resumed session: {session.session_id}")

    print("\n📤 Asking: 'What is your name now? Did your identity change?'")
    print("📥 Response: ", end="")
    response = await send_and_get_response(
        session, "What is your name now? Did your identity change?"
    )

    # "BETA" is checked first: a model that switched identity may still
    # mention its previous ALPHA name when explaining the change.
    if "BETA" in response:
        print("✅ SUCCESS: System message was updated to BETA")
        return True
    elif "ALPHA" in response:
        print("❌ BUG CONFIRMED: System message was NOT updated (still ALPHA)")
        return False
    else:
        print("⚠️ INCONCLUSIVE: Model response doesn't clearly indicate identity")
        return None


async def test_resume_without_system_message(client: CopilotClient, session_id: str):
    """Test 3: Resume session without specifying system_message.

    Purely informational: prints which identity marker (if any) shows up in
    the reply and returns nothing.
    """
    print("\n" + "=" * 60)
    print("TEST 3: Resume Session WITHOUT System Message")
    print("=" * 60)

    # NOTE(review): this resumes with "gpt-4o" while Tests 1-2 use
    # "gpt-5-mini" - confirm the model switch is intentional, since it is a
    # second variable in the experiment.
    resume_config = {
        "model": "gpt-4o",
        "streaming": True,
        # No system_message specified
    }

    session = await client.resume_session(session_id, resume_config)
    print(f"✅ Resumed session: {session.session_id}")

    print("\n📤 Asking: 'What is your name? Tell me your current identity.'")
    print("📥 Response: ", end="")
    response = await send_and_get_response(
        session, "What is your name? Tell me your current identity."
    )

    if "ALPHA" in response:
        print(
            "ℹ️ Without system_message: Model still remembers ALPHA from original session"
        )
    elif "BETA" in response:
        print("ℹ️ Without system_message: Model remembers BETA from Test 2")
    else:
        print("ℹ️ Model identity unclear")


async def main():
    """Run the native Copilot test sequence (Tests 1-3) and print a summary.

    Any exception raised while the tests run is printed with its traceback
    rather than propagated; the client is stopped in all cases.
    """
    print("=" * 60)
    print("🧪 Copilot SDK System Message Resume Test")
    print("=" * 60)
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Testing with SDK from: {CopilotClient.__module__}")

    # Create client with explicit CLI path if provided
    cli_path = os.environ.get("COPILOT_CLI_PATH")
    client_config = {"log_level": "info"}
    if cli_path:
        client_config["cli_path"] = cli_path

    client = CopilotClient(client_config)

    try:
        await client.start()
        print("✅ Client started successfully")

        # Test 1: Create new session with system message A
        session = await test_new_session_system_message(client)
        session_id = session.session_id

        # Wait a bit before resuming
        print("\n⏳ Waiting 2 seconds before resume test...")
        await asyncio.sleep(2)

        # Test 2: Resume with different system message B.
        # True -> updated (no bug), False -> bug confirmed, None -> unclear.
        system_message_updated = await test_resume_session_with_new_system_message(
            client, session_id
        )

        # Test 3: Resume without system message
        await test_resume_without_system_message(client, session_id)

        # Summary
        print("\n" + "=" * 60)
        print("📊 TEST SUMMARY (Native Copilot)")
        print("=" * 60)

        if system_message_updated is False:
            print(
                "❌ BUG CONFIRMED: session.resume does NOT apply system_message updates"
            )
            print(" The system message from create_session persists even when")
            print(" resume_session specifies a different system_message.")
            print("\n WORKAROUND: Inject system context into user prompt instead.")
        elif system_message_updated is True:
            print("✅ NO BUG: session.resume correctly updates system_message")
        else:
            print("⚠️ INCONCLUSIVE: Could not determine if bug exists")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and fall through
        # to cleanup instead of crashing.
        print(f"❌ Error: {e}")
        import traceback

        traceback.print_exc()
    finally:
        await client.stop()
        print("\n✅ Client stopped")
# =============================================================================
# BYOK OpenAI Test
# =============================================================================


async def test_byok_new_session(
    client: CopilotClient, provider_config: dict, model: str = "gpt-4o"
):
    """BYOK Test 1: New session with BYOK provider and system message A.

    Creates session ``byok-test-session-001`` with SYSTEM_MSG_A in ``replace``
    mode, asks the model for its name, and reports whether "ALPHA" appears in
    the reply.

    Args:
        client: Started Copilot client.
        provider_config: Provider settings; ``type`` and ``base_url`` are
            printed, and the whole dict is passed to the session config.
        model: Model name for the session (defaults to the previously
            hard-coded "gpt-4o").

    Returns:
        The created session, so callers can resume it by ``session_id``.
    """
    print("\n" + "=" * 60)
    print("BYOK TEST 1: New Session with BYOK Provider + System Message A (ALPHA)")
    print("=" * 60)

    print(
        f"📋 Provider: {provider_config.get('type')} @ {provider_config.get('base_url')}"
    )

    session_config = SessionConfig(
        session_id="byok-test-session-001",
        model=model,
        streaming=True,
        provider=provider_config,
        system_message={
            "mode": "replace",
            "content": SYSTEM_MSG_A,
        },
    )

    session = await client.create_session(config=session_config)
    print(f"✅ Created BYOK session: {session.session_id}")

    print("\n📤 Asking: 'What is your name?'")
    print("📥 Response: ", end="")
    response = await send_and_get_response(session, "What is your name?")

    if "ALPHA" in response:
        print("✅ SUCCESS: Model correctly identified as ALPHA")
    else:
        print("⚠️ WARNING: Model did NOT identify as ALPHA")

    return session


async def test_byok_resume_with_new_system_message(
    client: CopilotClient,
    session_id: str,
    provider_config: dict,
    model: str = "gpt-4o",
):
    """BYOK Test 2: Resume BYOK session with DIFFERENT system message B.

    Args:
        client: Started Copilot client.
        session_id: Identifier of the session to resume.
        provider_config: Provider settings forwarded in the resume config.
        model: Model name for the resumed session (defaults to the previously
            hard-coded "gpt-4o").

    Returns:
        True if the reply contains "BETA" (system message was updated),
        False if it contains "ALPHA" but not "BETA" (old message persisted),
        None if neither marker appears.
    """
    print("\n" + "=" * 60)
    print("BYOK TEST 2: Resume BYOK Session with System Message B (BETA)")
    print("=" * 60)

    resume_config = {
        "model": model,
        "streaming": True,
        "provider": provider_config,
        "system_message": {
            "mode": "replace",
            "content": SYSTEM_MSG_B,
        },
    }

    print("📋 Resume config includes system_message with mode='replace'")
    print("📋 New system_message content: BETA identity")
    print(
        f"📋 Provider: {provider_config.get('type')} @ {provider_config.get('base_url')}"
    )

    session = await client.resume_session(session_id, resume_config)
    print(f"✅ Resumed BYOK session: {session.session_id}")

    print("\n📤 Asking: 'What is your name now? Did your identity change?'")
    print("📥 Response: ", end="")
    response = await send_and_get_response(
        session, "What is your name now? Did your identity change?"
    )

    # "BETA" is checked first: a model that switched identity may still
    # mention its previous ALPHA name when explaining the change.
    if "BETA" in response:
        print("✅ SUCCESS: System message was updated to BETA")
        return True
    elif "ALPHA" in response:
        print("❌ BUG CONFIRMED: System message was NOT updated (still ALPHA)")
        return False
    else:
        print("⚠️ INCONCLUSIVE: Model response doesn't clearly indicate identity")
        return None


async def main_byok():
    """Run BYOK-specific tests.

    Configuration is read from the environment:
    - BYOK_API_KEY or OPENAI_API_KEY (required; the run aborts without one)
    - BYOK_BASE_URL (default "https://api.openai.com/v1")
    - BYOK_MODEL (default "gpt-4o")
    - COPILOT_CLI_PATH (optional, passed to the client as ``cli_path``)

    Any exception raised while the tests run is printed with its traceback
    rather than propagated; the client is stopped in all cases.
    """
    print("=" * 60)
    print("🧪 Copilot SDK BYOK System Message Resume Test")
    print("=" * 60)
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Get BYOK configuration from environment
    byok_api_key = os.environ.get("BYOK_API_KEY") or os.environ.get("OPENAI_API_KEY")
    byok_base_url = os.environ.get("BYOK_BASE_URL", "https://api.openai.com/v1")
    byok_model = os.environ.get("BYOK_MODEL", "gpt-4o")

    if not byok_api_key:
        print(
            "❌ Error: Please set BYOK_API_KEY or OPENAI_API_KEY environment variable"
        )
        print(" export BYOK_API_KEY='your_api_key'")
        print(" export BYOK_BASE_URL='https://api.openai.com/v1' # optional")
        print(" export BYOK_MODEL='gpt-4o' # optional")
        return

    provider_config = {
        "type": "openai",
        "base_url": byok_base_url,
        "api_key": byok_api_key,
    }

    # The API key is deliberately not printed.
    print(f"📋 BYOK Provider: openai @ {byok_base_url}")
    print(f"📋 BYOK Model: {byok_model}")

    # Create client
    cli_path = os.environ.get("COPILOT_CLI_PATH")
    client_config = {"log_level": "info"}
    if cli_path:
        client_config["cli_path"] = cli_path

    client = CopilotClient(client_config)

    try:
        await client.start()
        print("✅ Client started successfully")

        # BYOK Test 1: Create new session with BYOK provider, using the
        # configured model so the printed "BYOK Model" is the one under test.
        session = await test_byok_new_session(
            client, provider_config, model=byok_model
        )
        session_id = session.session_id

        # Wait a bit before resuming
        print("\n⏳ Waiting 2 seconds before resume test...")
        await asyncio.sleep(2)

        # BYOK Test 2: Resume with different system message B.
        # True -> updated (no bug), False -> bug confirmed, None -> unclear.
        system_message_updated = await test_byok_resume_with_new_system_message(
            client, session_id, provider_config, model=byok_model
        )

        # Summary
        print("\n" + "=" * 60)
        print("📊 BYOK TEST SUMMARY")
        print("=" * 60)

        if system_message_updated is False:
            print(
                "❌ BYOK BUG CONFIRMED: session.resume does NOT apply system_message updates"
            )
            print(" In BYOK mode, the system message from create_session persists")
            print(" even when resume_session specifies a different system_message.")
            print("\n WORKAROUND: Inject system context into user prompt instead.")
        elif system_message_updated is True:
            print("✅ BYOK NO BUG: session.resume correctly updates system_message")
        else:
            print("⚠️ BYOK INCONCLUSIVE: Could not determine if bug exists")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and fall through
        # to cleanup instead of crashing.
        print(f"❌ Error: {e}")
        import traceback

        traceback.print_exc()
    finally:
        await client.stop()
        print("\n✅ Client stopped")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Copilot SDK System Message Resume Test"
    )
    parser.add_argument(
        "--byok",
        action="store_true",
        help="Run BYOK (Bring Your Own Key) test instead of native Copilot test",
    )
    args = parser.parse_args()

    if args.byok:
        print("Running BYOK test mode...")
        asyncio.run(main_byok())
    else:
        print("Running native Copilot test mode...")
        print("(Use --byok flag for BYOK provider test)")
        asyncio.run(main())