From 6a3acc4548312d0ea60ed4404fa2371c86cda99d Mon Sep 17 00:00:00 2001
From: PythicCoder
Date: Tue, 11 Mar 2025 01:53:53 +0200
Subject: [PATCH] Feature add Add LlamaCppChatCompletionClient and llama-cpp
 (#5326)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This pull request integrates the `llama-cpp` library into the `autogen-ext` package. The main changes are a new optional `llama-cpp` dependency group, a new `autogen_ext.models.llama_cpp` module, and the implementation of a new chat completion client, `LlamaCppChatCompletionClient`.

### Project Dependencies:

* [`python/packages/autogen-ext/pyproject.toml`](diffhunk://#diff-095119d4420ff09059557bd25681211d1772c2be0fbe0ff2d551a3726eff1b4bR34-R38): Added `llama-cpp-python` as a new dependency under the `llama-cpp` extra.

### New Module:

* [`python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py`](diffhunk://#diff-42ae3ba17d51ca917634c4ea3c5969cf930297c288a783f8d9c126f2accef71dR1-R8): Exposed the `LlamaCppChatCompletionClient` class and raised a descriptive `ImportError` when the optional dependency is missing.

### Implementation of `LlamaCppChatCompletionClient`:

* `python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py`:
  - Added the `LlamaCppChatCompletionClient` class with methods to initialize the client (from a local model path or a Hugging Face Hub repository), create chat completions, and detect tool calls in model output; `create_stream` is not yet implemented.
  - Included detailed logging for debugging purposes and implemented methods to count tokens, track usage, and provide model information.…d chat capabilities

## Why are these changes needed?

## Related issue number

## Checks

- [x] I've included any doc changes needed for https://microsoft.github.io/autogen/. See https://microsoft.github.io/autogen/docs/Contribute#documentation to build and test documentation locally.
- [x] I've added tests (if relevant) corresponding to the changes introduced in this PR.
- [x] I've made sure all auto checks have passed.
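For reviewers, here is a minimal usage sketch of the new client, mirroring the docstring example added in this PR. The model path is a placeholder, and it assumes the `llama-cpp` extra (`pip install "autogen-ext[llama-cpp]"`) is installed:

```python
import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main() -> None:
    # Load a local GGUF model; the path below is a placeholder.
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    result = await llama_client.create(
        [UserMessage(content="What is the capital of France?", source="user")]
    )
    print(result.content)
    await llama_client.close()


asyncio.run(main())
```

A model can also be downloaded from Hugging Face Hub by passing `repo_id` and `filename` instead of `model_path`, as shown in the second docstring example below.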
--------- Co-authored-by: aribornstein Co-authored-by: Eric Zhu Co-authored-by: Ryan Sweet --- .github/workflows/checks.yml | 2 +- .../autogen-core/docs/src/reference/index.md | 1 + .../python/autogen_ext.models.llama_cpp.rst | 9 + python/packages/autogen-ext/pyproject.toml | 5 + .../autogen_ext/models/llama_cpp/__init__.py | 10 + .../llama_cpp/_llama_cpp_completion_client.py | 426 ++++++++++++++++++ .../test_commandline_code_executor.py | 3 +- .../models/test_llama_cpp_model_client.py | 205 +++++++++ python/uv.lock | 16 + 9 files changed, 674 insertions(+), 3 deletions(-) create mode 100644 python/packages/autogen-core/docs/src/reference/python/autogen_ext.models.llama_cpp.rst create mode 100644 python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py create mode 100644 python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py create mode 100644 python/packages/autogen-ext/tests/models/test_llama_cpp_model_client.py diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 99714fe7e..3b204f5f9 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -197,7 +197,7 @@ jobs: - name: Install Python deps run: | - uv sync --locked --all-extras + uv sync --locked --all-extras --no-extra llama-cpp shell: pwsh working-directory: ./python diff --git a/python/packages/autogen-core/docs/src/reference/index.md b/python/packages/autogen-core/docs/src/reference/index.md index e312c288d..d498a60fc 100644 --- a/python/packages/autogen-core/docs/src/reference/index.md +++ b/python/packages/autogen-core/docs/src/reference/index.md @@ -54,6 +54,7 @@ python/autogen_ext.models.azure python/autogen_ext.models.anthropic python/autogen_ext.models.semantic_kernel python/autogen_ext.models.ollama +python/autogen_ext.models.llama_cpp python/autogen_ext.tools.code_execution python/autogen_ext.tools.graphrag python/autogen_ext.tools.http diff --git a/python/packages/autogen-core/docs/src/reference/python/autogen_ext.models.llama_cpp.rst b/python/packages/autogen-core/docs/src/reference/python/autogen_ext.models.llama_cpp.rst new file mode 100644 index 000000000..56470d36d --- /dev/null +++ b/python/packages/autogen-core/docs/src/reference/python/autogen_ext.models.llama_cpp.rst @@ -0,0 +1,9 @@ +autogen\_ext.models.llama\_cpp +============================== + + +.. automodule:: autogen_ext.models.llama_cpp + :members: + :undoc-members: + :show-inheritance: + :member-order: bysource diff --git a/python/packages/autogen-ext/pyproject.toml b/python/packages/autogen-ext/pyproject.toml index e8a1ae27c..22fed0c7f 100644 --- a/python/packages/autogen-ext/pyproject.toml +++ b/python/packages/autogen-ext/pyproject.toml @@ -32,6 +32,11 @@ file-surfer = [ "autogen-agentchat==0.4.8", "markitdown~=0.0.1", ] + +llama-cpp = [ + "llama-cpp-python>=0.1.9", +] + graphrag = ["graphrag>=1.0.1"] chromadb = ["chromadb"] web-surfer = [ diff --git a/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py new file mode 100644 index 000000000..0324e4005 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py @@ -0,0 +1,10 @@ +try: + from ._llama_cpp_completion_client import LlamaCppChatCompletionClient +except ImportError as e: + raise ImportError( + "Dependencies for Llama Cpp not found. 
" + "Please install llama-cpp-python: " + "pip install autogen-ext[llama-cpp]" + ) from e + +__all__ = ["LlamaCppChatCompletionClient"] diff --git a/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py new file mode 100644 index 000000000..dc7772114 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py @@ -0,0 +1,426 @@ +import logging # added import +import re +from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Sequence, TypedDict, Union, cast + +from autogen_core import EVENT_LOGGER_NAME, CancellationToken, FunctionCall, MessageHandlerContext +from autogen_core.logging import LLMCallEvent +from autogen_core.models import ( + AssistantMessage, + ChatCompletionClient, + CreateResult, + FinishReasons, + FunctionExecutionResultMessage, + LLMMessage, + ModelInfo, + RequestUsage, + SystemMessage, + UserMessage, + validate_model_info, +) +from autogen_core.tools import Tool, ToolSchema +from llama_cpp import ( + ChatCompletionFunctionParameters, + ChatCompletionRequestAssistantMessage, + ChatCompletionRequestFunctionMessage, + ChatCompletionRequestSystemMessage, + ChatCompletionRequestToolMessage, + ChatCompletionRequestUserMessage, + ChatCompletionTool, + ChatCompletionToolFunction, + Llama, + llama_chat_format, +) +from typing_extensions import Unpack + +logger = logging.getLogger(EVENT_LOGGER_NAME) # initialize logger + + +def normalize_stop_reason(stop_reason: str | None) -> FinishReasons: + if stop_reason is None: + return "unknown" + + # Convert to lower case + stop_reason = stop_reason.lower() + + KNOWN_STOP_MAPPINGS: Dict[str, FinishReasons] = { + "stop": "stop", + "length": "length", + "content_filter": "content_filter", + "function_calls": "function_calls", + "end_turn": "stop", + "tool_calls": "function_calls", + } + + return KNOWN_STOP_MAPPINGS.get(stop_reason, "unknown") + + +def normalize_name(name: str) -> str: + """ + LLMs sometimes ask functions while ignoring their own format requirements, this function should be used to replace invalid characters with "_". + + Prefer _assert_valid_name for validating user configuration or input + """ + return re.sub(r"[^a-zA-Z0-9_-]", "_", name)[:64] + + +def assert_valid_name(name: str) -> str: + """ + Ensure that configured names are valid, raises ValueError if not. + + For munging LLM responses use _normalize_name to ensure LLM specified names don't break the API. + """ + if not re.match(r"^[a-zA-Z0-9_-]+$", name): + raise ValueError(f"Invalid name: {name}. Only letters, numbers, '_' and '-' are allowed.") + if len(name) > 64: + raise ValueError(f"Invalid name: {name}. Name must be less than 64 characters.") + return name + + +def convert_tools( + tools: Sequence[Tool | ToolSchema], +) -> List[ChatCompletionTool]: + result: List[ChatCompletionTool] = [] + for tool in tools: + if isinstance(tool, Tool): + tool_schema = tool.schema + else: + assert isinstance(tool, dict) + tool_schema = tool + + result.append( + ChatCompletionTool( + type="function", + function=ChatCompletionToolFunction( + name=tool_schema["name"], + description=(tool_schema["description"] if "description" in tool_schema else ""), + parameters=( + cast(ChatCompletionFunctionParameters, tool_schema["parameters"]) + if "parameters" in tool_schema + else {} + ), + ), + ) + ) + # Check if all tools have valid names. 
+ for tool_param in result: + assert_valid_name(tool_param["function"]["name"]) + return result + + +class LlamaCppParams(TypedDict, total=False): + # from_pretrained parameters: + repo_id: Optional[str] + filename: Optional[str] + additional_files: Optional[List[Any]] + local_dir: Optional[str] + local_dir_use_symlinks: Union[bool, Literal["auto"]] + cache_dir: Optional[str] + # __init__ parameters: + model_path: str + n_gpu_layers: int + split_mode: int + main_gpu: int + tensor_split: Optional[List[float]] + rpc_servers: Optional[str] + vocab_only: bool + use_mmap: bool + use_mlock: bool + kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] + seed: int + n_ctx: int + n_batch: int + n_ubatch: int + n_threads: Optional[int] + n_threads_batch: Optional[int] + rope_scaling_type: Optional[int] + pooling_type: int + rope_freq_base: float + rope_freq_scale: float + yarn_ext_factor: float + yarn_attn_factor: float + yarn_beta_fast: float + yarn_beta_slow: float + yarn_orig_ctx: int + logits_all: bool + embedding: bool + offload_kqv: bool + flash_attn: bool + no_perf: bool + last_n_tokens_size: int + lora_base: Optional[str] + lora_scale: float + lora_path: Optional[str] + numa: Union[bool, int] + chat_format: Optional[str] + chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] + draft_model: Optional[Any] # LlamaDraftModel not exposed by llama_cpp + tokenizer: Optional[Any] # BaseLlamaTokenizer not exposed by llama_cpp + type_k: Optional[int] + type_v: Optional[int] + spm_infill: bool + verbose: bool + + +class LlamaCppChatCompletionClient(ChatCompletionClient): + """Chat completion client for LlamaCpp models. + To use this client, you must install the `llama-cpp` extra: + + .. code-block:: bash + + pip install "autogen-ext[llama-cpp]" + + This client allows you to interact with LlamaCpp models, either by specifying a local model path or by downloading a model from Hugging Face Hub. + + Args: + model_path (optional, str): The path to the LlamaCpp model file. Required if repo_id and filename are not provided. + repo_id (optional, str): The Hugging Face Hub repository ID. Required if model_path is not provided. + filename (optional, str): The filename of the model within the Hugging Face Hub repository. Required if model_path is not provided. + n_gpu_layers (optional, int): The number of layers to put on the GPU. + n_ctx (optional, int): The context size. + n_batch (optional, int): The batch size. + verbose (optional, bool): Whether to print verbose output. + model_info (optional, ModelInfo): The capabilities of the model. Defaults to a ModelInfo instance with function_calling set to True. + **kwargs: Additional parameters to pass to the Llama class. + + Examples: + + The following code snippet shows how to use the client with a local model file: + + .. code-block:: python + + import asyncio + + from autogen_core.models import UserMessage + from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient + + + async def main(): + llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf") + result = await llama_client.create([UserMessage(content="What is the capital of France?", source="user")]) + print(result) + + + asyncio.run(main()) + + The following code snippet shows how to use the client with a model from Hugging Face Hub: + + .. 
code-block:: python + + import asyncio + + from autogen_core.models import UserMessage + from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient + + + async def main(): + llama_client = LlamaCppChatCompletionClient( + repo_id="unsloth/phi-4-GGUF", filename="phi-4-Q2_K_L.gguf", n_gpu_layers=-1, seed=1337, n_ctx=5000 + ) + result = await llama_client.create([UserMessage(content="What is the capital of France?", source="user")]) + print(result) + + + asyncio.run(main()) + """ + + def __init__( + self, + model_info: Optional[ModelInfo] = None, + **kwargs: Unpack[LlamaCppParams], + ) -> None: + """ + Initialize the LlamaCpp client. + """ + + if model_info: + validate_model_info(model_info) + + if "repo_id" in kwargs and "filename" in kwargs and kwargs["repo_id"] and kwargs["filename"]: + repo_id: str = cast(str, kwargs.pop("repo_id")) + filename: str = cast(str, kwargs.pop("filename")) + pretrained = Llama.from_pretrained(repo_id=repo_id, filename=filename, **kwargs) # type: ignore + assert isinstance(pretrained, Llama) + self.llm = pretrained + + elif "model_path" in kwargs: + self.llm = Llama(**kwargs) # pyright: ignore[reportUnknownMemberType] + else: + raise ValueError("Please provide model_path if ... or provide repo_id and filename if ....") + self._total_usage = {"prompt_tokens": 0, "completion_tokens": 0} + + async def create( + self, + messages: Sequence[LLMMessage], + *, + tools: Sequence[Tool | ToolSchema] = [], + # None means do not override the default + # A value means to override the client default - often specified in the constructor + json_output: Optional[bool] = None, + extra_create_args: Mapping[str, Any] = {}, + cancellation_token: Optional[CancellationToken] = None, + ) -> CreateResult: + # Convert LLMMessage objects to dictionaries with 'role' and 'content' + # converted_messages: List[Dict[str, str | Image | list[str | Image] | list[FunctionCall]]] = [] + converted_messages: list[ + ChatCompletionRequestSystemMessage + | ChatCompletionRequestUserMessage + | ChatCompletionRequestAssistantMessage + | ChatCompletionRequestUserMessage + | ChatCompletionRequestToolMessage + | ChatCompletionRequestFunctionMessage + ] = [] + for msg in messages: + if isinstance(msg, SystemMessage): + converted_messages.append({"role": "system", "content": msg.content}) + elif isinstance(msg, UserMessage) and isinstance(msg.content, str): + converted_messages.append({"role": "user", "content": msg.content}) + elif isinstance(msg, AssistantMessage) and isinstance(msg.content, str): + converted_messages.append({"role": "assistant", "content": msg.content}) + elif ( + isinstance(msg, SystemMessage) or isinstance(msg, UserMessage) or isinstance(msg, AssistantMessage) + ) and isinstance(msg.content, list): + raise ValueError("Multi-part messages such as those containing images are currently not supported.") + else: + raise ValueError(f"Unsupported message type: {type(msg)}") + + if self.model_info["function_calling"]: + response = self.llm.create_chat_completion( + messages=converted_messages, tools=convert_tools(tools), stream=False + ) + else: + response = self.llm.create_chat_completion(messages=converted_messages, stream=False) + + if not isinstance(response, dict): + raise ValueError("Unexpected response type from LlamaCpp model.") + + self._total_usage["prompt_tokens"] += response["usage"]["prompt_tokens"] + self._total_usage["completion_tokens"] += response["usage"]["completion_tokens"] + + # Parse the response + response_tool_calls: ChatCompletionTool | None = None + 
response_text: str | None = None + if "choices" in response and len(response["choices"]) > 0: + if "message" in response["choices"][0]: + response_text = response["choices"][0]["message"]["content"] + if "tool_calls" in response["choices"][0]: + response_tool_calls = response["choices"][0]["tool_calls"] # type: ignore + + content: List[FunctionCall] | str = "" + thought: str | None = None + if response_tool_calls: + content = [] + for tool_call in response_tool_calls: + if not isinstance(tool_call, dict): + raise ValueError("Unexpected tool call type from LlamaCpp model.") + content.append( + FunctionCall( + id=tool_call["id"], + arguments=tool_call["function"]["arguments"], + name=normalize_name(tool_call["function"]["name"]), + ) + ) + if response_text and len(response_text) > 0: + thought = response_text + else: + if response_text: + content = response_text + + # Detect tool usage in the response + if not response_tool_calls and not response_text: + logger.debug("DEBUG: No response text found. Returning empty response.") + return CreateResult( + content="", usage=RequestUsage(prompt_tokens=0, completion_tokens=0), finish_reason="stop", cached=False + ) + + # Create a CreateResult object + if "finish_reason" in response["choices"][0]: + finish_reason = response["choices"][0]["finish_reason"] + else: + finish_reason = "unknown" + if finish_reason not in ("stop", "length", "function_calls", "content_filter", "unknown"): + finish_reason = "unknown" + create_result = CreateResult( + content=content, + thought=thought, + usage=cast(RequestUsage, response["usage"]), + finish_reason=normalize_stop_reason(finish_reason), # type: ignore + cached=False, + ) + + # If we are running in the context of a handler we can get the agent_id + try: + agent_id = MessageHandlerContext.agent_id() + except RuntimeError: + agent_id = None + + logger.info( + LLMCallEvent( + messages=cast(List[Dict[str, Any]], converted_messages), + response=create_result.model_dump(), + prompt_tokens=response["usage"]["prompt_tokens"], + completion_tokens=response["usage"]["completion_tokens"], + agent_id=agent_id, + ) + ) + return create_result + + async def create_stream( + self, + messages: Sequence[LLMMessage], + *, + tools: Sequence[Tool | ToolSchema] = [], + # None means do not override the default + # A value means to override the client default - often specified in the constructor + json_output: Optional[bool] = None, + extra_create_args: Mapping[str, Any] = {}, + cancellation_token: Optional[CancellationToken] = None, + ) -> AsyncGenerator[Union[str, CreateResult], None]: + raise NotImplementedError("Stream not yet implemented for LlamaCppChatCompletionClient") + yield "" + + # Implement abstract methods + def actual_usage(self) -> RequestUsage: + return RequestUsage( + prompt_tokens=self._total_usage.get("prompt_tokens", 0), + completion_tokens=self._total_usage.get("completion_tokens", 0), + ) + + @property + def capabilities(self) -> ModelInfo: + return self.model_info + + def count_tokens( + self, + messages: Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], + **kwargs: Any, + ) -> int: + total = 0 + for msg in messages: + # Use the Llama model's tokenizer to encode the content + tokens = self.llm.tokenize(str(msg.content).encode("utf-8")) + total += len(tokens) + return total + + @property + def model_info(self) -> ModelInfo: + return ModelInfo(vision=False, json_output=False, family="llama-cpp", function_calling=True) + + def remaining_tokens( + self, + messages: 
Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], + **kwargs: Any, + ) -> int: + used_tokens = self.count_tokens(messages) + return max(self.llm.n_ctx() - used_tokens, 0) + + def total_usage(self) -> RequestUsage: + return RequestUsage( + prompt_tokens=self._total_usage.get("prompt_tokens", 0), + completion_tokens=self._total_usage.get("completion_tokens", 0), + ) + + async def close(self) -> None: + """ + Close the LlamaCpp client. + """ + self.llm.close() diff --git a/python/packages/autogen-ext/tests/code_executors/test_commandline_code_executor.py b/python/packages/autogen-ext/tests/code_executors/test_commandline_code_executor.py index 7ff87909a..4e138a53b 100644 --- a/python/packages/autogen-ext/tests/code_executors/test_commandline_code_executor.py +++ b/python/packages/autogen-ext/tests/code_executors/test_commandline_code_executor.py @@ -3,8 +3,8 @@ import asyncio import os -import shutil import platform +import shutil import sys import tempfile import venv @@ -18,7 +18,6 @@ from autogen_core import CancellationToken from autogen_core.code_executor import CodeBlock from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor - HAS_POWERSHELL: bool = platform.system() == "Windows" and ( shutil.which("powershell") is not None or shutil.which("pwsh") is not None ) diff --git a/python/packages/autogen-ext/tests/models/test_llama_cpp_model_client.py b/python/packages/autogen-ext/tests/models/test_llama_cpp_model_client.py new file mode 100644 index 000000000..1898fe7a7 --- /dev/null +++ b/python/packages/autogen-ext/tests/models/test_llama_cpp_model_client.py @@ -0,0 +1,205 @@ +import contextlib +import sys +from typing import TYPE_CHECKING, Any, ContextManager, Generator, List, Sequence, Union + +import pytest +import torch + +# from autogen_agentchat.agents import AssistantAgent +# from autogen_agentchat.messages import TextMessage +# from autogen_core import CancellationToken +from autogen_core.models import RequestUsage, SystemMessage, UserMessage + +# from autogen_core.tools import FunctionTool +try: + from llama_cpp import ChatCompletionMessageToolCalls + + if TYPE_CHECKING: + from autogen_ext.models.llama_cpp._llama_cpp_completion_client import LlamaCppChatCompletionClient +except ImportError: + # If llama_cpp is not installed, we can't run the tests. + pytest.skip("Skipping LlamaCppChatCompletionClient tests: llama-cpp-python not installed", allow_module_level=True) + + +# Fake Llama class to simulate responses +class FakeLlama: + def __init__( + self, + model_path: str, + **_: Any, + ) -> None: + self.model_path = model_path + self.n_ctx = lambda: 1024 + + # Added tokenize method for testing purposes. + def tokenize(self, b: bytes) -> list[int]: + return list(b) + + def create_chat_completion( + self, messages: Any, tools: List[ChatCompletionMessageToolCalls] | None, stream: bool = False + ) -> dict[str, Any]: + # Return fake non-streaming response. + + return { + "usage": {"prompt_tokens": 1, "completion_tokens": 2}, + "choices": [{"message": {"content": "Fake response"}}], + } + + def __call__(self, prompt: str, stream: bool = True) -> Generator[dict[str, Any], None, None]: + # Yield fake streaming tokens. 
+ yield {"choices": [{"text": "Hello "}]} + yield {"choices": [{"text": "World"}]} + + +@pytest.fixture +@contextlib.contextmanager +def get_completion_client( + monkeypatch: pytest.MonkeyPatch, +) -> "Generator[type[LlamaCppChatCompletionClient], None, None]": + with monkeypatch.context() as m: + m.setattr("llama_cpp.Llama", FakeLlama) + from autogen_ext.models.llama_cpp._llama_cpp_completion_client import LlamaCppChatCompletionClient + + yield LlamaCppChatCompletionClient + sys.modules.pop("autogen_ext.models.llama_cpp._llama_cpp_completion_client", None) + sys.modules.pop("llama_cpp", None) + + +@pytest.mark.asyncio +async def test_llama_cpp_create(get_completion_client: "ContextManager[type[LlamaCppChatCompletionClient]]") -> None: + with get_completion_client as Client: + client = Client(model_path="dummy") + messages: Sequence[Union[SystemMessage, UserMessage]] = [ + SystemMessage(content="Test system"), + UserMessage(content="Test user", source="user"), + ] + result = await client.create(messages=messages) + assert result.content == "Fake response" + usage: RequestUsage = result.usage + assert usage.prompt_tokens == 1 + assert usage.completion_tokens == 2 + assert result.finish_reason in ("stop", "unknown") + + +# Commmented out due to raising not implemented error will leave in case streaming is supported in the future. +# @pytest.mark.asyncio +# async def test_llama_cpp_create_stream( +# get_completion_client: "ContextManager[type[LlamaCppChatCompletionClient]]", +# ) -> None: +# with get_completion_client as Client: +# client = Client(filename="dummy") +# messages: Sequence[Union[SystemMessage, UserMessage]] = [ +# SystemMessage(content="Test system"), +# UserMessage(content="Test user", source="user"), +# ] +# collected = "" +# async for token in client.create_stream(messages=messages): +# collected += token +# assert collected == "Hello World" + + +@pytest.mark.asyncio +async def test_create_invalid_message( + get_completion_client: "ContextManager[type[LlamaCppChatCompletionClient]]", +) -> None: + with get_completion_client as Client: + client = Client(model_path="dummy") + # Pass an unsupported message type (integer) to trigger ValueError. + with pytest.raises(ValueError, match="Unsupported message type"): + await client.create(messages=[123]) # type: ignore + + +@pytest.mark.asyncio +async def test_count_and_remaining_tokens( + get_completion_client: "ContextManager[type[LlamaCppChatCompletionClient]]", monkeypatch: pytest.MonkeyPatch +) -> None: + with get_completion_client as Client: + client = Client(model_path="dummy") + msg = SystemMessage(content="Test") + # count_tokens should count the bytes + token_count = client.count_tokens([msg]) + # Since "Test" encoded is 4 bytes, expect 4 tokens. + assert token_count >= 4 + remaining = client.remaining_tokens([msg]) + # remaining should be (1024 - token_count); ensure non-negative. 
+ assert remaining == max(1024 - token_count, 0) + + +@pytest.mark.asyncio +async def test_llama_cpp_integration_non_streaming() -> None: + if not ((hasattr(torch.backends, "mps") and torch.backends.mps.is_available()) or torch.cuda.is_available()): + pytest.skip("Skipping LlamaCpp integration tests: GPU not available not set") + + from autogen_ext.models.llama_cpp._llama_cpp_completion_client import LlamaCppChatCompletionClient + + client = LlamaCppChatCompletionClient( + repo_id="unsloth/phi-4-GGUF", filename="phi-4-Q2_K_L.gguf", n_gpu_layers=-1, seed=1337, n_ctx=5000 + ) + messages: Sequence[Union[SystemMessage, UserMessage]] = [ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="Hello, how are you?", source="user"), + ] + result = await client.create(messages=messages) + assert isinstance(result.content, str) and len(result.content.strip()) > 0 + + +# Commmented out due to raising not implemented error will leave in case streaming is supported in the future. +# @pytest.mark.asyncio +# async def test_llama_cpp_integration_streaming() -> None: +# if not ((hasattr(torch.backends, "mps") and torch.backends.mps.is_available()) or torch.cuda.is_available()): +# pytest.skip("Skipping LlamaCpp integration tests: GPU not available not set") + +# from autogen_ext.models.llama_cpp._llama_cpp_completion_client import LlamaCppChatCompletionClient +# client = LlamaCppChatCompletionClient( +# repo_id="unsloth/phi-4-GGUF", filename="phi-4-Q2_K_L.gguf", n_gpu_layers=-1, seed=1337, n_ctx=5000 +# ) +# messages: Sequence[Union[SystemMessage, UserMessage]] = [ +# SystemMessage(content="You are a helpful assistant."), +# UserMessage(content="Please stream your response.", source="user"), +# ] +# collected = "" +# async for token in client.create_stream(messages=messages): +# collected += token +# assert isinstance(collected, str) and len(collected.strip()) > 0 + +# Commented out tool use as this functionality is not yet implemented for Phi-4. +# Define tools (functions) for the AssistantAgent +# def add(num1: int, num2: int) -> int: +# """Add two numbers together""" +# return num1 + num2 + + +# @pytest.mark.asyncio +# async def test_llama_cpp_integration_tool_use() -> None: +# if not ((hasattr(torch.backends, "mps") and torch.backends.mps.is_available()) or torch.cuda.is_available()): +# pytest.skip("Skipping LlamaCpp integration tests: GPU not available not set") + +# from autogen_ext.models.llama_cpp._llama_cpp_completion_client import LlamaCppChatCompletionClient + +# model_client = LlamaCppChatCompletionClient( +# repo_id="unsloth/phi-4-GGUF", filename="phi-4-Q2_K_L.gguf", n_gpu_layers=-1, seed=1337, n_ctx=5000 +# ) + +# # Initialize the AssistantAgent +# assistant = AssistantAgent( +# name="assistant", +# system_message=("You can add two numbers together using the `add` function. "), +# model_client=model_client, +# tools=[ +# FunctionTool( +# add, +# description="Add two numbers together. The first argument is num1 and second is num2. 
The return value is num1 + num2", +# ) +# ], +# reflect_on_tool_use=True, # Reflect on tool results +# ) + +# # Test the tool +# response = await assistant.on_messages( +# [ +# TextMessage(content="add 3 and 4", source="user"), +# ], +# CancellationToken(), +# ) + +# assert "7" in response.chat_message.content diff --git a/python/uv.lock b/python/uv.lock index 12ef4315f..b6c38aa8d 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -628,6 +628,9 @@ jupyter-executor = [ langchain = [ { name = "langchain-core" }, ] +llama-cpp = [ + { name = "llama-cpp-python" }, +] magentic-one = [ { name = "autogen-agentchat" }, { name = "markitdown" }, @@ -735,6 +738,7 @@ requires-dist = [ { name = "json-schema-to-pydantic", marker = "extra == 'http-tool'", specifier = ">=0.2.0" }, { name = "json-schema-to-pydantic", marker = "extra == 'mcp'", specifier = ">=0.2.2" }, { name = "langchain-core", marker = "extra == 'langchain'", specifier = "~=0.3.3" }, + { name = "llama-cpp-python", marker = "extra == 'llama-cpp'", specifier = ">=0.1.9" }, { name = "markitdown", marker = "extra == 'file-surfer'", specifier = "~=0.0.1" }, { name = "markitdown", marker = "extra == 'magentic-one'", specifier = "~=0.0.1" }, { name = "markitdown", marker = "extra == 'web-surfer'", specifier = "~=0.0.1" }, @@ -3501,6 +3505,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/b1/9355547c3b9043ba2821e7797f322c753dfa4d2a3da7bb05690fce689eaa/llama_cloud-0.1.11-py3-none-any.whl", hash = "sha256:b703765d03783a5a0fc57a52adc9892f8b91b0c19bbecb85a54ad4e813342951", size = 250609 }, ] +[[package]] +name = "llama-cpp-python" +version = "0.3.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "diskcache" }, + { name = "jinja2" }, + { name = "numpy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/38/7a47b1fb1d83eaddd86ca8ddaf20f141cbc019faf7b425283d8e5ef710e5/llama_cpp_python-0.3.7.tar.gz", hash = "sha256:0566a0dcc0f38005c4093309a87f67c2452449522e3e17e15cd735a62957894c", size = 66715891 } + [[package]] name = "llama-index" version = "0.12.14"