Create a Python script called phi3v.py containing the code below.
This script launches a chatbot server using the Phi-3.5 vision model and ONNX Runtime.
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License
import argparse
import os
import glob
import time
from pathlib import Path
import onnxruntime_genai as og
def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
curr_path = Path(current_dir).absolute()
target_dir = glob.glob(target_dir_name, root_dir=curr_path)
if target_dir:
return Path(curr_path / target_dir[0]).absolute()
else:
if curr_path.parent == curr_path:
# Root dir
return None
return _find_dir_contains_sub_dir(curr_path / '..', target_dir_name)
def _complete(text, state):
return (glob.glob(text + "*") + [None])[state]
def run(args: argparse.Namespace):
    """Chat loop over the Phi-3.5 vision model via onnxruntime_genai.

    Loads the model from ``args.model_path`` on the requested execution
    provider, then repeatedly reads image paths and a prompt (from stdin when
    interactive, from ``args`` otherwise), streams the generated response to
    stdout, and prints timing metrics. Runs a single iteration and returns in
    non-interactive mode; loops forever otherwise.

    Raises:
        FileNotFoundError: if any supplied image path does not exist.
    """
    print("Loading model...")
    config = og.Config(args.model_path)
    config.clear_providers()
    if args.execution_provider != "cpu":
        print(f"Setting model to {args.execution_provider}...")
        config.append_provider(args.execution_provider)
    model = og.Model(config)
    print("Model loaded")

    processor = model.create_multimodal_processor()
    tokenizer_stream = processor.create_stream()

    interactive = not args.non_interactive
    if interactive:
        # Configure tab-completion for file paths once, before the loop
        # (the original re-registered the completer on every iteration).
        try:
            import readline
            readline.set_completer_delims(" \t\n;")
            readline.parse_and_bind("tab: complete")
            readline.set_completer(_complete)
        except ImportError:
            # Not available on some platforms. Ignore it.
            pass

    while True:
        if interactive:
            image_paths = [
                image_path.strip()
                for image_path in input(
                    "Image Path (comma separated; leave empty if no image): "
                ).split(",")
            ]
        else:
            if args.image_paths:
                image_paths = args.image_paths
            else:
                # Fall back to the bundled CI test image.
                image_paths = [str(_find_dir_contains_sub_dir(Path(__file__).parent, "test") / "test_models" / "images" / "australia.jpg")]
        image_paths = [image_path for image_path in image_paths if image_path]

        images = None
        prompt = "<|user|>\n"
        if len(image_paths) == 0:
            print("No image provided")
        else:
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    raise FileNotFoundError(f"Image file not found: {image_path}")
                print(f"Using image: {image_path}")
                # Phi-3.5 vision expects 1-based <|image_N|> placeholders.
                prompt += f"<|image_{i+1}|>\n"
            images = og.Images.open(*image_paths)

        if interactive:
            text = input("Prompt: ")
        else:
            text = args.prompt if args.prompt else "What is shown in this image?"
        prompt += f"{text}<|end|>\n<|assistant|>\n"

        print("Processing images and prompt...")
        inputs = processor(prompt, images=images)

        print("Generating response...")
        start_time = time.time()
        params = og.GeneratorParams(model)
        params.set_inputs(inputs)
        params.set_search_options(max_length=7680)
        generator = og.Generator(model, params)

        first_token_time = None
        token_count = 0
        while not generator.is_done():
            generator.generate_next_token()
            new_token = generator.get_next_tokens()[0]
            decoded_token = tokenizer_stream.decode(new_token)
            token_count += 1
            if token_count == 1:
                # Timestamp of the first emitted token, for the TTFT metric.
                first_token_time = time.time()
            print(decoded_token, end="", flush=True)
        end_time = time.time()

        print()
        print(f"Total Time : {end_time - start_time:.4f} sec")
        # Guard the metrics: the generator may finish without emitting any
        # token, in which case the original code crashed on an undefined
        # timestamp (and could divide by zero when end_time == first token time).
        if first_token_time is not None:
            print(f"Time to First Token : {first_token_time - start_time:.4f} sec")
            decode_elapsed = end_time - first_token_time
            if decode_elapsed > 0:
                print(f"Tokens per second : {token_count / decode_elapsed:.2f} tokens/sec")
        for _ in range(3):
            print()

        # Delete the generator to free the captured graph before creating another one
        del generator

        if not interactive:
            break
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-m", "--model_path", type=str, required=True, help="Path to the folder containing the model"
)
parser.add_argument(
"-e", "--execution_provider", type=str, required=True, choices=["cpu", "cuda", "dml"], help="Execution provider to run model"
)
parser.add_argument(
"--image_paths", nargs='*', type=str, required=False, help="Path to the images, mainly for CI usage"
)
parser.add_argument(
'-pr', '--prompt', required=False, help='Input prompts to generate tokens from, mainly for CI usage'
)
parser.add_argument(
'--non-interactive', action=argparse.BooleanOptionalAction, required=False, help='Non-interactive mode, mainly for CI usage'
)
args = parser.parse_args()
run(args)
You’re now ready to run the chatbot server.
Use the following command in a terminal to start the server:
python3 phi3v.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu
You should see output similar to the image below when the server starts successfully: