Completions¶
Language Models are trained to predict natural language and provide text outputs as a response to their inputs. The inputs are called prompts and outputs are referred to as completions. Completions are hence at the core of any generative AI application. Use the Python SDK to generate intelligent responses to your users' requests in either blocking or streaming fashion. SGP supports a variety of open source and commercial models under the same unified interface (see Supported Models).
Synchronous Completions¶
An example of generating completions with the SGP Python SDK is as follows:
# Typing helpers used in the model-name annotations of the examples below.
from typing import Literal, Iterable, Union
import dotenv
# EGPClient is the entry point to the SGP (Scale EGP) Python SDK.
from scale_egp.sdk.client import EGPClient
# Path to a local env file; presumably it holds the API credentials the
# client reads from the environment — confirm against your deployment setup.
ENV_FILE = ".env.local"
# override=True lets values in the file take precedence over the shell env.
dotenv.load_dotenv(ENV_FILE, override=True)
def sync_completion(
    egp_client: EGPClient,
    model: Union[
        Literal[
            "gpt-4", "gpt-4-0613", "gpt-4-32k", "gpt-4-32k-0613",
            "gpt-3.5-turbo", "gpt-3.5-turbo-0613",
            "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613",
            "text-davinci-003", "text-davinci-002", "text-curie-001",
            "text-babbage-001", "text-ada-001",
            "claude-instant-1", "claude-instant-1.1",
            "claude-2", "claude-2.0",
            "llama-7b", "llama-2-7b", "llama-2-7b-chat",
            "llama-2-13b", "llama-2-13b-chat",
            "llama-2-70b", "llama-2-70b-chat",
            "falcon-7b", "falcon-7b-instruct",
            "falcon-40b", "falcon-40b-instruct",
            "mpt-7b", "mpt-7b-instruct",
            "flan-t5-xxl",
            "mistral-7b", "mistral-7b-instruct",
            "mixtral-8x7b", "mixtral-8x7b-instruct",
            "llm-jp-13b-instruct-full", "llm-jp-13b-instruct-full-dolly",
            "zephyr-7b-alpha", "zephyr-7b-beta",
            "codellama-7b", "codellama-7b-instruct",
            "codellama-13b", "codellama-13b-instruct",
            "codellama-34b", "codellama-34b-instruct",
            "codellama-70b", "codellama-70b-instruct",
        ],
        str,
    ],
    input_prompt: str,
) -> str:
    """Generate a completion for ``input_prompt`` with a single blocking call.

    Args:
        egp_client: An initialized SGP client used to issue the request.
        model: Model to query — one of the known model-name literals, or any
            other model-name string the platform accepts.
        input_prompt: Prompt text submitted to the model.

    Returns:
        The generated completion text.
    """
    response = egp_client.completions().create(model=model, prompt=input_prompt)
    return response.completion.text
Token Streaming¶
The SGP Python SDK supports token streaming to reduce perceived latency for certain applications. When streaming, tokens will be sent as data-only server-side events.
from typing import Literal, Iterable, Union
import dotenv
from scale_egp.sdk.client import EGPClient
def stream_completion(
    egp_client: EGPClient,
    model: Union[
        Literal[
            "gpt-4", "gpt-4-0613", "gpt-4-32k", "gpt-4-32k-0613",
            "gpt-3.5-turbo", "gpt-3.5-turbo-0613",
            "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613",
            "text-davinci-003", "text-davinci-002", "text-curie-001",
            "text-babbage-001", "text-ada-001",
            "claude-instant-1", "claude-instant-1.1",
            "claude-2", "claude-2.0",
            "llama-7b", "llama-2-7b", "llama-2-7b-chat",
            "llama-2-13b", "llama-2-13b-chat",
            "llama-2-70b", "llama-2-70b-chat",
            "falcon-7b", "falcon-7b-instruct",
            "falcon-40b", "falcon-40b-instruct",
            "mpt-7b", "mpt-7b-instruct",
            "flan-t5-xxl",
            "mistral-7b", "mistral-7b-instruct",
            "mixtral-8x7b", "mixtral-8x7b-instruct",
            "llm-jp-13b-instruct-full", "llm-jp-13b-instruct-full-dolly",
            "zephyr-7b-alpha", "zephyr-7b-beta",
            "codellama-7b", "codellama-7b-instruct",
            "codellama-13b", "codellama-13b-instruct",
            "codellama-34b", "codellama-34b-instruct",
            "codellama-70b", "codellama-70b-instruct",
        ],
        str,
    ],
    input_prompt: str,
) -> Iterable[str]:
    """Stream a completion for ``input_prompt`` token chunk by token chunk.

    Args:
        egp_client: An initialized SGP client used to issue the request.
        model: Model to query — one of the known model-name literals, or any
            other model-name string the platform accepts.
        input_prompt: Prompt text submitted to the model.

    Yields:
        Each chunk of generated text as it arrives from the server.
    """
    chunks = egp_client.completions().stream(model=model, prompt=input_prompt)
    yield from (chunk.completion.text for chunk in chunks)
if __name__ == "__main__":
    # Interactive demo: read a prompt from stdin and stream the response.
    client = EGPClient()
    user_input = input("Enter a prompt to submit for a streaming completion request:\n")
    generated_text_generator = stream_completion(
        egp_client=client,
        model="gpt-3.5-turbo",
        input_prompt=user_input,
    )
    # Plain string instead of the original placeholder-free f-string (ruff F541).
    print("AI Response:")
    for generated_text in generated_text_generator:
        # flush=True pushes each chunk to the terminal as soon as it arrives,
        # replacing the separate sys.stdout.flush() call (and the need for
        # the mid-file `import sys`).
        print(generated_text, end="", flush=True)
    # Trailing newline after the streamed response.
    print()
See the full Completion SDK reference documentation to learn more.