Python: #6761 Onnx Connector #8106

Open

wants to merge 29 commits into main from issue-6761-ONNX-gen-ai-Connector

Commits (29)
ff979ba
setup for onnx connector
nmoeller Aug 14, 2024
49a2a72
initial implementation commit
nmoeller Aug 16, 2024
b2c5a70
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Aug 16, 2024
342db1d
initial unit tests for onnx text completion
nmoeller Aug 19, 2024
9c371de
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Aug 19, 2024
adf262b
added chat completion support
nmoeller Aug 21, 2024
a40a6cb
added small comment regarding Image Opening
nmoeller Aug 22, 2024
fd6d9b4
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 2, 2024
b118396
migrated to uv
nmoeller Sep 2, 2024
0da7615
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 12, 2024
0b6df05
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 12, 2024
3c41141
added unit tests and integration tests
nmoeller Sep 17, 2024
81bf663
added unit tests and integration tests
nmoeller Sep 18, 2024
bd21157
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 18, 2024
21f585f
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
58702f3
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
8908cb9
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
af15de6
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
14128e2
integrated pr feedback
nmoeller Sep 19, 2024
352dede
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Sep 19, 2024
9471cbb
integrated pr feedback
nmoeller Sep 19, 2024
49514d6
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 20, 2024
322afd8
integrated new template architecture
nmoeller Sep 20, 2024
7c31645
adjusted default max length for int test with image
nmoeller Sep 20, 2024
477db50
fixed documentation in samples
nmoeller Sep 20, 2024
030254a
fixed docstring for chat completion
nmoeller Sep 20, 2024
627f0b7
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Sep 20, 2024
e86db81
Update python/semantic_kernel/connectors/ai/onnx/onnx_utils.py
nmoeller Sep 21, 2024
cdb18ba
implemented pr feedback
nmoeller Sep 21, 2024
3 changes: 3 additions & 0 deletions python/pyproject.toml
@@ -86,6 +86,9 @@ mistralai = [
ollama = [
"ollama ~= 0.2"
]
onnx = [
"onnxruntime-genai ~= 0.4"
]
anthropic = [
"anthropic ~= 0.32"
]
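The new extra maps the connector onto the onnxruntime-genai runtime. A minimal sketch (not part of the diff) of guarding that optional dependency in user code; the error message is illustrative:

try:
    import onnxruntime_genai  # installed via the "onnx" extra, e.g. pip install semantic-kernel[onnx]
except ImportError as exc:
    raise ImportError("onnxruntime-genai is required for the ONNX connector; install the 'onnx' extra.") from exc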
75 changes: 75 additions & 0 deletions python/samples/concepts/local_models/onnx_chat_completion.py
@@ -0,0 +1,75 @@
# Copyright (c) Microsoft. All rights reserved.


import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.kernel import Kernel

# This concept sample shows how to use the Onnx connector
# with a local model running via ONNX Runtime GenAI.

kernel = Kernel()

service_id = "phi3"
#############################################
# Make sure to download an ONNX model
# (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx)
# If onnxruntime-genai is used:
#     use the model stored in the /cpu folder
# If onnxruntime-genai-cuda is installed for GPU use:
#     use the model stored in the /cuda folder
# Then set the ONNX_GEN_AI_FOLDER environment variable to the path of the model folder
#############################################
streaming = True

chat_completion = OnnxGenAIChatCompletion(ai_model_id=service_id, template="phi3")
settings = OnnxGenAIPromptExecutionSettings()

system_message = """You are a helpful assistant."""
chat_history = ChatHistory(system_message=system_message)


async def chat() -> bool:
try:
user_input = input("User:> ")
except KeyboardInterrupt:
print("\n\nExiting chat...")
return False
except EOFError:
print("\n\nExiting chat...")
return False

if user_input == "exit":
print("\n\nExiting chat...")
return False
chat_history.add_user_message(user_input)
if streaming:
print("Mosscap:> ", end="")
message = ""
async for chunk in chat_completion.get_streaming_chat_message_content(
chat_history=chat_history, settings=settings, kernel=kernel
):
if chunk:
print(str(chunk), end="")
message += str(chunk)
print("\n")
chat_history.add_assistant_message(message)
else:
answer = await chat_completion.get_chat_message_content(
chat_history=chat_history, settings=settings, kernel=kernel
)
print(f"Mosscap:> {answer}")
chat_history.add_assistant_message(answer)
return True


async def main() -> None:
chatting = True
while chatting:
chatting = await chat()


if __name__ == "__main__":
asyncio.run(main())
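A minimal sketch (not part of the sample) of pointing the connector at a downloaded model programmatically instead of exporting ONNX_GEN_AI_FOLDER in the shell; the model path is a placeholder and the constructor arguments mirror those used above:

import os

from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings

# Placeholder path: use the /cpu folder for onnxruntime-genai or /cuda for onnxruntime-genai-cuda.
os.environ["ONNX_GEN_AI_FOLDER"] = "/models/Phi-3-mini-4k-instruct-onnx/cpu"

chat_completion = OnnxGenAIChatCompletion(ai_model_id="phi3", template="phi3")
settings = OnnxGenAIPromptExecutionSettings()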
@@ -0,0 +1,92 @@
# Copyright (c) Microsoft. All rights reserved.


import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings
from semantic_kernel.contents import AuthorRole, ChatMessageContent, ImageContent
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.kernel import Kernel

# This concept sample shows how to use the Onnx connector
# with a local model running via ONNX Runtime GenAI.

kernel = Kernel()

service_id = "phi3"
#############################################
# Make sure to download an ONNX model
# If onnxruntime-genai is used:
#     (https://huggingface.co/microsoft/Phi-3-vision-128k-instruct-onnx-cpu)
# If onnxruntime-genai-cuda is installed for GPU use:
#     (https://huggingface.co/microsoft/Phi-3-vision-128k-instruct-onnx-gpu)
# Then set the ONNX_GEN_AI_FOLDER environment variable to the path of the model folder
#############################################
streaming = False

chat_completion = OnnxGenAIChatCompletion(ai_model_id=service_id, template="phi3v")

# The max_length property is important for allocating RAM:
# if the value is too big, you may run out of memory;
# if the value is too small, your input is limited.
settings = OnnxGenAIPromptExecutionSettings(max_length=3072)

system_message = """
You are a helpful assistant.
You know about provided images and the history of the conversation.
"""
chat_history = ChatHistory(system_message=system_message)


async def chat() -> bool:
try:
user_input = input("User:> ")
except KeyboardInterrupt:
print("\n\nExiting chat...")
return False
except EOFError:
print("\n\nExiting chat...")
return False

if user_input == "exit":
print("\n\nExiting chat...")
return False
chat_history.add_user_message(user_input)
if streaming:
print("Mosscap:> ", end="")
message = ""
async for chunk in chat_completion.get_streaming_chat_message_content(
chat_history=chat_history, settings=settings, kernel=kernel
):
            if chunk.content:
                print(chunk.content, end="")
                message += chunk.content
        print("\n")
        chat_history.add_assistant_message(message)
else:
answer = await chat_completion.get_chat_message_content(
chat_history=chat_history, settings=settings, kernel=kernel
)
print(f"Mosscap:> {answer}")
chat_history.add_message(answer)
return True


async def main() -> None:
chatting = True
image_path = input("Image Path (leave empty if no image): ")
if image_path:
chat_history.add_message(
ChatMessageContent(
role=AuthorRole.USER,
items=[
ImageContent.from_image_path(image_path=image_path),
],
),
)
while chatting:
chatting = await chat()


if __name__ == "__main__":
asyncio.run(main())
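The sample attaches the image once at startup and then adds plain-text turns. A minimal sketch (not part of the sample) of a single user turn that carries both text and image; TextContent is assumed to come from the same semantic_kernel.contents package, and the question and path are placeholders:

from semantic_kernel.contents import AuthorRole, ChatMessageContent, ImageContent, TextContent

# One user message combining the question and the image it refers to.
chat_history.add_message(
    ChatMessageContent(
        role=AuthorRole.USER,
        items=[
            TextContent(text="What is shown in this picture?"),  # placeholder question
            ImageContent.from_image_path(image_path="sample.jpg"),  # placeholder path
        ],
    )
)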
72 changes: 72 additions & 0 deletions python/samples/concepts/local_models/onnx_text_completion.py
@@ -0,0 +1,72 @@
# Copyright (c) Microsoft. All rights reserved.


import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAITextCompletion
from semantic_kernel.functions.kernel_arguments import KernelArguments
from semantic_kernel.kernel import Kernel

# This concept sample shows how to use the Onnx connector
# with a local model running via ONNX Runtime GenAI.

kernel = Kernel()

service_id = "phi3"
#############################################
# Make sure to download an ONNX model
# (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx)
# If onnxruntime-genai is used:
#     use the model stored in the /cpu folder
# If onnxruntime-genai-cuda is installed for GPU use:
#     use the model stored in the /cuda folder
# Then set the ONNX_GEN_AI_FOLDER environment variable to the path of the model folder
#############################################
streaming = True

kernel.add_service(OnnxGenAITextCompletion(ai_model_id=service_id))

settings = kernel.get_prompt_execution_settings_from_service_id(service_id)

chat_function = kernel.add_function(
plugin_name="ChatBot",
function_name="Chat",
prompt="<|user|>{{$user_input}}<|end|><|assistant|>",
template_format="semantic-kernel",
prompt_execution_settings=settings,
)


async def chat() -> bool:
try:
user_input = input("User:> ")
except KeyboardInterrupt:
print("\n\nExiting chat...")
return False
except EOFError:
print("\n\nExiting chat...")
return False

if user_input == "exit":
print("\n\nExiting chat...")
return False

if streaming:
print("Mosscap:> ", end="")
async for chunk in kernel.invoke_stream(chat_function, KernelArguments(user_input=user_input)):
print(chunk[0].text, end="")
print("\n")
else:
answer = await kernel.invoke(chat_function, KernelArguments(user_input=user_input))
print(f"Mosscap:> {answer}")
return True


async def main() -> None:
chatting = True
while chatting:
chatting = await chat()


if __name__ == "__main__":
asyncio.run(main())
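A minimal sketch (not part of the sample) of a one-shot, non-interactive call using the kernel and chat_function defined above; the question is a placeholder:

async def ask_once(question: str) -> None:
    # Single invocation of the prompt-based ChatBot function registered above.
    result = await kernel.invoke(chat_function, KernelArguments(user_input=question))
    print(f"Mosscap:> {result}")

# Example: asyncio.run(ask_once("Why is the sky blue?"))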
@@ -68,6 +68,9 @@ def __init__(
except ValidationError as ex:
raise ServiceInitializationError("Failed to create Ollama settings.", ex) from ex

if not ollama_settings.model:
            raise ServiceInitializationError("Provide ai_model_id or set the OLLAMA_MODEL environment variable.")

super().__init__(
service_id=service_id or ollama_settings.model,
ai_model_id=ollama_settings.model,
9 changes: 9 additions & 0 deletions python/semantic_kernel/connectors/ai/onnx/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.connectors.ai.onnx.onnx_gen_ai_prompt_execution_settings import (
OnnxGenAIPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.onnx.services.onnx_gen_ai_chat_completion import OnnxGenAIChatCompletion
from semantic_kernel.connectors.ai.onnx.services.onnx_gen_ai_text_completion import OnnxGenAITextCompletion

__all__ = ["OnnxGenAIChatCompletion", "OnnxGenAIPromptExecutionSettings", "OnnxGenAITextCompletion"]
python/semantic_kernel/connectors/ai/onnx/onnx_gen_ai_prompt_execution_settings.py
@@ -0,0 +1,25 @@
# Copyright (c) Microsoft. All rights reserved.


from pydantic import Field

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings


class OnnxGenAIPromptExecutionSettings(PromptExecutionSettings):
"""OnnxGenAI prompt execution settings."""

diversity_penalty: float | None = Field(None, ge=0.0, le=1.0)
do_sample: bool = False
early_stopping: bool = True
length_penalty: float | None = Field(None, ge=0.0, le=1.0)
max_length: int = Field(3072, gt=0)
min_length: int | None = Field(None, gt=0)
no_repeat_ngram_size: int = 0
num_beams: int | None = Field(None, gt=0)
num_return_sequences: int | None = Field(None, gt=0)
    past_present_share_buffer: bool = True
repetition_penalty: float | None = Field(None, ge=0.0, le=1.0)
temperature: float | None = Field(None, ge=0.0, le=2.0)
top_k: int | None = Field(None, gt=0)
top_p: float | None = Field(None, ge=0.0, le=1.0)
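These settings default to greedy decoding (do_sample=False). A minimal sketch of enabling sampling; the field names come from the class above and the values are illustrative:

from semantic_kernel.connectors.ai.onnx import OnnxGenAIPromptExecutionSettings

# Sampling-based generation instead of the greedy default; values are illustrative.
settings = OnnxGenAIPromptExecutionSettings(
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    max_length=2048,
)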
@@ -0,0 +1,22 @@
# Copyright (c) Microsoft. All rights reserved.

from typing import ClassVar

from semantic_kernel.kernel_pydantic import KernelBaseSettings


class OnnxGenAISettings(KernelBaseSettings):
"""Onnx Gen AI model settings.

The settings are first loaded from environment variables with the prefix 'ONNX_GEN_AI_'. If the
environment variables are not found, the settings can be loaded from a .env file with the
    encoding 'utf-8'. If the settings are not found in either location, validation will fail,
    alerting you that the required settings are missing.

    Settings for the prefix 'ONNX_GEN_AI_' are:
    - folder: Path to the Onnx model folder (ENV: ONNX_GEN_AI_FOLDER). Required.
    - env_file_path: If provided, the .env settings are read from this file path location.
"""

env_prefix: ClassVar[str] = "ONNX_GEN_AI_"
folder: str
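A minimal sketch (not part of the diff) of the environment-based resolution described in the docstring above; the model path is a placeholder, and an equivalent ONNX_GEN_AI_FOLDER entry could instead live in a .env file referenced via env_file_path:

import os

# Picked up through the "ONNX_GEN_AI_" prefix when the settings are created.
os.environ["ONNX_GEN_AI_FOLDER"] = "/models/Phi-3-mini-4k-instruct-onnx/cpu"  # placeholder path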