Fu-Jie_openwebui-extensions/plugins/pipes/github-copilot-sdk/debug/test_system_message_resume.py

#!/usr/bin/env python3
"""
Copilot SDK System Message Test Script
Tests whether system_message is properly applied during session.resume

This script verifies the bug hypothesis:
- session.resume with system_message config may not reliably update the system prompt

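Test scenarios:
1. Create a new session with a custom system message (ALPHA)
2. Resume the same session with a DIFFERENT system message (BETA)
3. Resume the same session again without specifying any system message
After each step, ask the model for its name to infer which system message is in effect.

Run with --byok to execute scenarios 1 and 2 against a BYOK (Bring Your Own Key) provider.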

Requirements:
- github-copilot-sdk>=0.1.23
"""

import asyncio
import os
import sys
import time

from copilot import CopilotClient
from copilot.types import SessionConfig
from copilot.generated.session_events import SessionEventType


# Test system messages.
# Each prompt assigns the assistant a distinct name and a mandatory reply
# prefix, so the tests below can tell which prompt is in effect by searching
# the reply text for "ALPHA" or "BETA".

# Prompt A: used when the session is first created.
SYSTEM_MSG_A = """You are a helpful assistant named "ALPHA".
When asked about your name or identity, you MUST respond: "I am ALPHA, the first assistant."
Always start your responses with "[ALPHA]:" prefix.
"""

# Prompt B: supplied when the session is resumed, to check whether it replaces A.
SYSTEM_MSG_B = """You are a helpful assistant named "BETA".
When asked about your name or identity, you MUST respond: "I am BETA, the second assistant."
Always start your responses with "[BETA]:" prefix.
"""


async def send_and_get_response(session, prompt: str) -> str:
    """Send ``prompt`` to ``session`` and return the assistant's full reply.

    A temporary event handler is registered through ``session.on``. It echoes
    the reply to stdout as it arrives and accumulates it for the return value.
    The handler is always unsubscribed before this coroutine returns or raises.

    Args:
        session: Object exposing ``on(handler)`` (returning an unsubscribe
            callable) and an awaitable ``send(payload)``.
        prompt: User message to send.

    Returns:
        The concatenation of all streamed delta chunks, or - when no delta
        carried any text - the content of the first final assistant message.
        An empty string if neither produced text.

    Raises:
        asyncio.TimeoutError: Neither a session-idle nor a turn-end event
            arrived within 120 seconds.
    """
    chunks = []
    response_complete = asyncio.Event()

    def event_handler(event):
        if event.type == SessionEventType.ASSISTANT_MESSAGE_DELTA:
            # Streaming chunks carry their text in ``delta_content``; reading
            # only ``content`` silently dropped every chunk. ``content`` is
            # kept as a fallback for payloads that populate it instead.
            delta = (
                getattr(event.data, "delta_content", None)
                or getattr(event.data, "content", None)
                or ""
            )
            if delta:
                print(delta, end="", flush=True)
                chunks.append(delta)
        elif event.type == SessionEventType.ASSISTANT_MESSAGE:
            # Final complete message: only used when streaming yielded no
            # text, so the reply is never printed or stored twice.
            content = getattr(event.data, "content", "") or ""
            if content and not chunks:
                chunks.append(content)
                print(content, end="", flush=True)
        elif event.type in (
            SessionEventType.SESSION_IDLE,
            SessionEventType.ASSISTANT_TURN_END,
        ):
            # Either event is treated as "the reply is finished".
            response_complete.set()

    # Subscribe before sending so no early event is missed.
    unsubscribe = session.on(event_handler)

    try:
        await session.send({"prompt": prompt, "mode": "immediate"})
        # Bound the wait so a stalled session cannot hang the script.
        await asyncio.wait_for(response_complete.wait(), timeout=120)
        print()  # newline after the streamed output
    finally:
        unsubscribe()

    return "".join(chunks)


async def test_new_session_system_message(client: CopilotClient):
    """Test 1: create a fresh session whose system message is SYSTEM_MSG_A.

    Asks the model for its name, reports whether the reply contains "ALPHA",
    and returns the created session so later tests can resume it.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 1: New Session with System Message A (ALPHA)")
    print(rule)

    # "replace" mode swaps out the default system prompt entirely.
    alpha_system_message = {
        "mode": "replace",
        "content": SYSTEM_MSG_A,
    }
    session = await client.create_session(
        config=SessionConfig(
            session_id="test-session-001",
            model="gpt-5-mini",
            streaming=True,
            system_message=alpha_system_message,
        )
    )
    print(f"✅ Created new session: {session.session_id}")

    print("\n📤 Asking: 'What is your name?'")
    print("📥 Response: ", end="")
    reply = await send_and_get_response(session, "What is your name?")

    identified_as_alpha = "ALPHA" in reply
    if not identified_as_alpha:
        print("⚠️ WARNING: Model did NOT identify as ALPHA")
    else:
        print("✅ SUCCESS: Model correctly identified as ALPHA")

    return session


async def test_resume_session_with_new_system_message(
    client: CopilotClient, session_id: str
):
    """Test 2: resume ``session_id`` while supplying SYSTEM_MSG_B.

    Returns:
        True if the reply contains "BETA" (the new system message took
        effect), False if it contains only "ALPHA" (the old one persisted),
        or None when neither name appears in the reply.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 2: Resume Session with System Message B (BETA)")
    print(rule)

    beta_system_message = {"mode": "replace", "content": SYSTEM_MSG_B}
    resume_config = {
        "model": "gpt-5-mini",
        "streaming": True,
        "system_message": beta_system_message,
    }

    print("📋 Resume config includes system_message with mode='replace'")
    print("📋 New system_message content: BETA identity")

    session = await client.resume_session(session_id, resume_config)
    print(f"✅ Resumed session: {session.session_id}")

    print("\n📤 Asking: 'What is your name now? Did your identity change?'")
    print("📥 Response: ", end="")
    reply = await send_and_get_response(
        session, "What is your name now? Did your identity change?"
    )

    # "BETA" is checked first: a reply mentioning both names counts as updated.
    if "BETA" in reply:
        print("✅ SUCCESS: System message was updated to BETA")
        return True

    if "ALPHA" in reply:
        print("❌ BUG CONFIRMED: System message was NOT updated (still ALPHA)")
        return False

    print("⚠️ INCONCLUSIVE: Model response doesn't clearly indicate identity")
    return None


async def test_resume_without_system_message(client: CopilotClient, session_id: str):
    """Test 3: resume ``session_id`` with no ``system_message`` in the config.

    Purely informational: prints which identity (if any) the model reports
    and returns nothing.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 3: Resume Session WITHOUT System Message")
    print(rule)

    # Deliberately omits "system_message".
    # NOTE(review): this resume uses "gpt-4o" while Tests 1 and 2 in this file
    # use "gpt-5-mini" - confirm whether the model switch is intentional.
    resume_config = {"model": "gpt-4o", "streaming": True}

    session = await client.resume_session(session_id, resume_config)
    print(f"✅ Resumed session: {session.session_id}")

    print("\n📤 Asking: 'What is your name? Tell me your current identity.'")
    print("📥 Response: ", end="")
    reply = await send_and_get_response(
        session, "What is your name? Tell me your current identity."
    )

    # "ALPHA" takes precedence when both names appear in the reply.
    if "ALPHA" in reply:
        note = (
            "ℹ️ Without system_message: Model still remembers ALPHA from original session"
        )
    elif "BETA" in reply:
        note = "ℹ️ Without system_message: Model remembers BETA from Test 2"
    else:
        note = "ℹ️ Model identity unclear"
    print(note)


async def main():
    """Run the native Copilot scenario: create, resume with B, resume bare.

    Reads the optional COPILOT_CLI_PATH environment variable, runs Tests 1-3
    in order, prints a summary based on Test 2's outcome, and always stops
    the client. Any exception is printed with its traceback, not re-raised.
    """
    rule = "=" * 60
    print(rule)
    print("🧪 Copilot SDK System Message Resume Test")
    print(rule)
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Testing with SDK from: {CopilotClient.__module__}")

    # Only pass an explicit CLI path when the environment provides one.
    client_config = {"log_level": "info"}
    cli_path = os.environ.get("COPILOT_CLI_PATH")
    if cli_path:
        client_config["cli_path"] = cli_path

    client = CopilotClient(client_config)

    try:
        await client.start()
        print("✅ Client started successfully")

        # Test 1: new session with system message A.
        created = await test_new_session_system_message(client)
        session_id = created.session_id

        # Short pause before resuming.
        print("\n⏳ Waiting 2 seconds before resume test...")
        await asyncio.sleep(2)

        # Test 2: True = system message B applied, False = A persisted,
        # None = could not tell.
        system_message_applied = await test_resume_session_with_new_system_message(
            client, session_id
        )

        # Test 3: resume again with no system message at all.
        await test_resume_without_system_message(client, session_id)

        print("\n" + rule)
        print("📊 TEST SUMMARY (Native Copilot)")
        print(rule)
        if system_message_applied is True:
            print("✅ NO BUG: session.resume correctly updates system_message")
        elif system_message_applied is False:
            print(
                "❌ BUG CONFIRMED: session.resume does NOT apply system_message updates"
            )
            print("   The system message from create_session persists even when")
            print("   resume_session specifies a different system_message.")
            print("\n   WORKAROUND: Inject system context into user prompt instead.")
        else:
            print("⚠️ INCONCLUSIVE: Could not determine if bug exists")

    except Exception as e:
        # Top-level boundary of a debug script: report and fall through to cleanup.
        print(f"❌ Error: {e}")
        import traceback

        traceback.print_exc()
    finally:
        await client.stop()
        print("\n✅ Client stopped")


# =============================================================================
# BYOK OpenAI Test
# =============================================================================


async def test_byok_new_session(
    client: CopilotClient, provider_config: dict, model: str = "gpt-4o"
):
    """BYOK Test 1: create a session on a BYOK provider with SYSTEM_MSG_A.

    Args:
        client: Started Copilot client.
        provider_config: Provider settings passed through to the session;
            its ``type`` and ``base_url`` entries are printed.
        model: Model name to request from the provider. Defaults to
            ``"gpt-4o"``, the value this test previously hard-coded.

    Returns:
        The created session, so the caller can resume it by id.
    """
    print("\n" + "=" * 60)
    print("BYOK TEST 1: New Session with BYOK Provider + System Message A (ALPHA)")
    print("=" * 60)
    print(
        f"📋 Provider: {provider_config.get('type')} @ {provider_config.get('base_url')}"
    )

    session_config = SessionConfig(
        session_id="byok-test-session-001",
        model=model,
        streaming=True,
        provider=provider_config,
        system_message={
            "mode": "replace",
            "content": SYSTEM_MSG_A,
        },
    )

    session = await client.create_session(config=session_config)
    print(f"✅ Created BYOK session: {session.session_id}")

    print("\n📤 Asking: 'What is your name?'")
    print("📥 Response: ", end="")
    response = await send_and_get_response(session, "What is your name?")

    if "ALPHA" in response:
        print("✅ SUCCESS: Model correctly identified as ALPHA")
    else:
        print("⚠️ WARNING: Model did NOT identify as ALPHA")

    return session


async def test_byok_resume_with_new_system_message(
    client: CopilotClient,
    session_id: str,
    provider_config: dict,
    model: str = "gpt-4o",
):
    """BYOK Test 2: resume a BYOK session while supplying SYSTEM_MSG_B.

    Args:
        client: Started Copilot client.
        session_id: Id of the session created by BYOK Test 1.
        provider_config: Provider settings passed through on resume.
        model: Model name to request from the provider. Defaults to
            ``"gpt-4o"``, the value this test previously hard-coded.

    Returns:
        True if the reply contains "BETA" (new system message applied),
        False if it contains only "ALPHA" (old one persisted), or None when
        neither name appears.
    """
    print("\n" + "=" * 60)
    print("BYOK TEST 2: Resume BYOK Session with System Message B (BETA)")
    print("=" * 60)

    resume_config = {
        "model": model,
        "streaming": True,
        "provider": provider_config,
        "system_message": {
            "mode": "replace",
            "content": SYSTEM_MSG_B,
        },
    }

    print("📋 Resume config includes system_message with mode='replace'")
    print("📋 New system_message content: BETA identity")
    print(
        f"📋 Provider: {provider_config.get('type')} @ {provider_config.get('base_url')}"
    )

    session = await client.resume_session(session_id, resume_config)
    print(f"✅ Resumed BYOK session: {session.session_id}")

    print("\n📤 Asking: 'What is your name now? Did your identity change?'")
    print("📥 Response: ", end="")
    response = await send_and_get_response(
        session, "What is your name now? Did your identity change?"
    )

    # "BETA" is checked first: a reply mentioning both names counts as updated.
    if "BETA" in response:
        print("✅ SUCCESS: System message was updated to BETA")
        return True
    elif "ALPHA" in response:
        print("❌ BUG CONFIRMED: System message was NOT updated (still ALPHA)")
        return False
    else:
        print("⚠️ INCONCLUSIVE: Model response doesn't clearly indicate identity")
        return None


async def main_byok():
    """Run the BYOK scenario: create with A, then resume with B.

    Configuration comes from the environment:
        BYOK_API_KEY or OPENAI_API_KEY: required; the run aborts without it.
        BYOK_BASE_URL: optional, defaults to ``https://api.openai.com/v1``.
        BYOK_MODEL: optional, defaults to ``gpt-4o``; used for both the
            created and the resumed session.
        COPILOT_CLI_PATH: optional explicit CLI path for the client.

    Any exception is printed with its traceback rather than re-raised, and
    the client is always stopped.
    """
    print("=" * 60)
    print("🧪 Copilot SDK BYOK System Message Resume Test")
    print("=" * 60)
    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Get BYOK configuration from environment
    byok_api_key = os.environ.get("BYOK_API_KEY") or os.environ.get("OPENAI_API_KEY")
    byok_base_url = os.environ.get("BYOK_BASE_URL", "https://api.openai.com/v1")
    byok_model = os.environ.get("BYOK_MODEL", "gpt-4o")

    if not byok_api_key:
        print(
            "❌ Error: Please set BYOK_API_KEY or OPENAI_API_KEY environment variable"
        )
        print("   export BYOK_API_KEY='your_api_key'")
        print("   export BYOK_BASE_URL='https://api.openai.com/v1'  # optional")
        print("   export BYOK_MODEL='gpt-4o'  # optional")
        return

    provider_config = {
        "type": "openai",
        "base_url": byok_base_url,
        "api_key": byok_api_key,
    }

    print(f"📋 BYOK Provider: openai @ {byok_base_url}")
    print(f"📋 BYOK Model: {byok_model}")

    # Create client, passing an explicit CLI path only when one is provided
    cli_path = os.environ.get("COPILOT_CLI_PATH")
    client_config = {"log_level": "info"}
    if cli_path:
        client_config["cli_path"] = cli_path

    client = CopilotClient(client_config)

    try:
        await client.start()
        print("✅ Client started successfully")

        # BYOK Test 1: create the session on the configured model, so the
        # advertised BYOK_MODEL setting actually takes effect.
        session = await test_byok_new_session(
            client, provider_config, model=byok_model
        )
        session_id = session.session_id

        # Wait a bit before resuming
        print("\n⏳ Waiting 2 seconds before resume test...")
        await asyncio.sleep(2)

        # BYOK Test 2: resume on the same model with system message B.
        # True = B applied, False = A persisted, None = could not tell.
        system_message_applied = await test_byok_resume_with_new_system_message(
            client, session_id, provider_config, model=byok_model
        )

        # Summary
        print("\n" + "=" * 60)
        print("📊 BYOK TEST SUMMARY")
        print("=" * 60)
        if system_message_applied is False:
            print(
                "❌ BYOK BUG CONFIRMED: session.resume does NOT apply system_message updates"
            )
            print("   In BYOK mode, the system message from create_session persists")
            print("   even when resume_session specifies a different system_message.")
            print("\n   WORKAROUND: Inject system context into user prompt instead.")
        elif system_message_applied is True:
            print("✅ BYOK NO BUG: session.resume correctly updates system_message")
        else:
            print("⚠️ BYOK INCONCLUSIVE: Could not determine if bug exists")

    except Exception as e:
        # Top-level boundary of a debug script: report and fall through to cleanup.
        print(f"❌ Error: {e}")
        import traceback

        traceback.print_exc()
    finally:
        await client.stop()
        print("\n✅ Client stopped")


if __name__ == "__main__":
    import argparse

    # Command-line entry point: one flag selects which scenario runs.
    cli = argparse.ArgumentParser(description="Copilot SDK System Message Resume Test")
    cli.add_argument(
        "--byok",
        action="store_true",
        help="Run BYOK (Bring Your Own Key) test instead of native Copilot test",
    )
    options = cli.parse_args()

    if not options.byok:
        # Default: native Copilot scenario.
        print("Running native Copilot test mode...")
        print("(Use --byok flag for BYOK provider test)")
        asyncio.run(main())
    else:
        print("Running BYOK test mode...")
        asyncio.run(main_byok())