10 changes: 9 additions & 1 deletion docs/docs/concepts/services.md
@@ -290,7 +290,7 @@ $ dstack ps --verbose

</div>

??? info "Probe statuses"
??? info "Status"
The following symbols are used for probe statuses:

- `×` &mdash; the last probe execution failed.
@@ -328,6 +328,11 @@ Probes are executed for each service replica while the replica is `running`. A p

</div>

??? info "Model"
If you set the [`model`](#model) property but don't explicitly configure `probes`,
`dstack` automatically configures a default probe that tests the model using the `/v1/chat/completions` API.
To disable probes entirely when `model` is set, explicitly set `probes` to an empty list.

See the [reference](../reference/dstack.yml/service.md#probes) for more probe configuration options.

### Path prefix { #path-prefix }
@@ -425,6 +430,9 @@ Limits apply to the whole service (all replicas) and per client (by IP). Clients
If the service runs a model with an OpenAI-compatible interface, you can set the [`model`](#model) property to make the model accessible through `dstack`'s chat UI on the `Models` page.
In this case, `dstack` will use the service's `/v1/chat/completions` endpoint.

When `model` is set, `dstack` automatically configures [`probes`](#probes) to verify model health.
To customize or disable this, set `probes` explicitly.

### Resources

If you specify memory size, you can either specify an explicit size (e.g. `24GB`) or a
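To illustrate the documented behavior, here is a minimal sketch using the `parse_run_configuration` helper exercised by the tests later in this diff (the model name is just a placeholder):

```python
from dstack._internal.core.models.configurations import parse_run_configuration

base = {
    "type": "service",
    "commands": ["python3 -m http.server"],
    "port": 8000,
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
}

# `probes` omitted: the parsed configuration keeps probes=None, and the
# server-side job configurator later injects the default /v1/chat/completions probe.
assert parse_run_configuration(base).probes is None

# `probes: []` set explicitly: probing is disabled and no default is generated.
assert parse_run_configuration({**base, "probes": []}).probes == []
```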
2 changes: 1 addition & 1 deletion src/dstack/_internal/cli/services/configurators/run.py
@@ -354,7 +354,7 @@ def interpolate_env(self, conf: RunConfigurationT):
password=interpolator.interpolate_or_error(conf.registry_auth.password),
)
if isinstance(conf, ServiceConfiguration):
for probe in conf.probes:
for probe in conf.probes or []:
for header in probe.headers:
header.value = interpolator.interpolate_or_error(header.value)
if probe.url:
16 changes: 12 additions & 4 deletions src/dstack/_internal/core/models/configurations.py
@@ -56,6 +56,8 @@
DEFAULT_PROBE_METHOD = "get"
MAX_PROBE_URL_LEN = 2048
DEFAULT_REPLICA_GROUP_NAME = "0"
DEFAULT_MODEL_PROBE_TIMEOUT = 30
DEFAULT_MODEL_PROBE_URL = "/v1/chat/completions"


class RunConfigurationType(str, Enum):
@@ -851,9 +853,13 @@ class ServiceConfigurationParams(CoreModel):
] = None
rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
probes: Annotated[
list[ProbeConfig],
Field(description="List of probes used to determine job health"),
] = []
Optional[list[ProbeConfig]],
Field(
description="The list of probes to determine service health. "
"If `model` is set, defaults to a `/v1/chat/completions` probe. "
"Set explicitly to override"
),
] = None # None = omitted (may get default when model is set); [] = explicit empty

replicas: Annotated[
Optional[Union[List[ReplicaGroup], Range[int]]],
@@ -895,7 +901,9 @@ def validate_rate_limits(cls, v: list[RateLimit]) -> list[RateLimit]:
return v

@validator("probes")
def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
def validate_probes(cls, v: Optional[list[ProbeConfig]]) -> Optional[list[ProbeConfig]]:
if v is None:
return v
if has_duplicates(v):
# Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
# https://github.com/pydantic/pydantic/issues/3765
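A small sketch of the new field semantics and the existing duplicate check (the raise inside `validate_probes` is collapsed in this diff, so the `ValueError` below is an assumption based on pydantic v1 conventions):

```python
from dstack._internal.core.models.configurations import ProbeConfig, ServiceConfiguration

# `probes` omitted at the model level stays None; the job configurator decides later.
assert ServiceConfiguration(port=80, image="debian").probes is None

# An explicit empty list is preserved, meaning "no probes, even if `model` is set".
assert ServiceConfiguration(port=80, image="debian", probes=[]).probes == []

# Duplicate probes are still rejected by validate_probes (assumed to raise a
# ValueError-based validation error, as is conventional in pydantic v1).
try:
    ServiceConfiguration(
        port=80,
        image="debian",
        probes=[ProbeConfig(type="http", url="/health"), ProbeConfig(type="http", url="/health")],
    )
except ValueError:
    pass
```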
35 changes: 34 additions & 1 deletion src/dstack/_internal/server/services/jobs/configurators/base.py
@@ -5,19 +5,23 @@
from pathlib import PurePosixPath
from typing import Dict, List, Optional

import orjson
from cachetools import TTLCache, cached

from dstack._internal import settings
from dstack._internal.core.errors import DockerRegistryError, ServerClientError
from dstack._internal.core.models.common import RegistryAuth
from dstack._internal.core.models.configurations import (
DEFAULT_MODEL_PROBE_TIMEOUT,
DEFAULT_MODEL_PROBE_URL,
DEFAULT_PROBE_INTERVAL,
DEFAULT_PROBE_METHOD,
DEFAULT_PROBE_READY_AFTER,
DEFAULT_PROBE_TIMEOUT,
DEFAULT_PROBE_URL,
DEFAULT_REPLICA_GROUP_NAME,
LEGACY_REPO_DIR,
HTTPHeaderSpec,
PortMapping,
ProbeConfig,
PythonVersion,
@@ -39,6 +43,7 @@
Retry,
RunSpec,
)
from dstack._internal.core.models.services import OpenAIChatModel
from dstack._internal.core.models.unix import UnixUser
from dstack._internal.core.models.volumes import MountPoint, VolumeMountPoint
from dstack._internal.core.services.profiles import get_retry
@@ -394,7 +399,13 @@ def _service_port(self) -> Optional[int]:

def _probes(self) -> list[ProbeSpec]:
if isinstance(self.run_spec.configuration, ServiceConfiguration):
return list(map(_probe_config_to_spec, self.run_spec.configuration.probes))
probes = self.run_spec.configuration.probes
if probes is not None:
return list(map(_probe_config_to_spec, probes))
# Generate default probe if model is set
model = self.run_spec.configuration.model
if isinstance(model, OpenAIChatModel):
return [_default_model_probe_spec(model.name)]
return []


@@ -447,6 +458,28 @@ def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec:
)


def _default_model_probe_spec(model_name: str) -> ProbeSpec:
body = orjson.dumps(
{
"model": model_name,
"messages": [{"role": "user", "content": "hi"}],
"max_tokens": 1,
}
).decode("utf-8")
return ProbeSpec(
type="http",
method="post",
url=DEFAULT_MODEL_PROBE_URL,
headers=[
HTTPHeaderSpec(name="Content-Type", value="application/json"),
],
body=body,
timeout=DEFAULT_MODEL_PROBE_TIMEOUT,
interval=DEFAULT_PROBE_INTERVAL,
ready_after=DEFAULT_PROBE_READY_AFTER,
)


def _join_shell_commands(commands: List[str]) -> str:
for i, cmd in enumerate(commands):
cmd = cmd.strip()
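For reference, the probe spec generated by `_default_model_probe_spec` amounts to roughly the following request against a replica (a sketch only; `requests` and the base URL are stand-ins for the actual probe executor):

```python
import requests

# Roughly what the generated default probe does: POST a one-token chat request
# to the replica's /v1/chat/completions endpoint and expect a successful response.
response = requests.post(
    "http://localhost:8000/v1/chat/completions",  # placeholder base URL
    headers={"Content-Type": "application/json"},
    json={
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",  # the configured model name
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 1,
    },
    timeout=30,  # DEFAULT_MODEL_PROBE_TIMEOUT
)
response.raise_for_status()
```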
4 changes: 2 additions & 2 deletions src/dstack/_internal/server/services/runs/spec.py
@@ -94,13 +94,13 @@ def validate_run_spec_and_set_defaults(
raise ServerClientError(
"Scheduled services with autoscaling to zero are not supported"
)
if len(run_spec.configuration.probes) > settings.MAX_PROBES_PER_JOB:
if len(run_spec.configuration.probes or []) > settings.MAX_PROBES_PER_JOB:
raise ServerClientError(
f"Cannot configure more than {settings.MAX_PROBES_PER_JOB} probes"
)
if any(
p.timeout is not None and p.timeout > settings.MAX_PROBE_TIMEOUT
for p in run_spec.configuration.probes
for p in (run_spec.configuration.probes or [])
):
raise ServerClientError(
f"Probe timeout cannot be longer than {settings.MAX_PROBE_TIMEOUT}s"
41 changes: 41 additions & 0 deletions src/tests/_internal/core/models/test_configurations.py
@@ -7,12 +7,53 @@
from dstack._internal.core.models.configurations import (
DevEnvironmentConfigurationParams,
RepoSpec,
ServiceConfiguration,
parse_run_configuration,
)
from dstack._internal.core.models.resources import Range


class TestParseConfiguration:
def test_service_model_probes_none_when_omitted(self):
"""When model is set but probes omitted, probes should remain None.
The default probe is generated server-side in the job configurator."""
conf = {
"type": "service",
"commands": ["python3 -m http.server"],
"port": 8000,
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
}
parsed = parse_run_configuration(conf)
assert isinstance(parsed, ServiceConfiguration)
assert parsed.probes is None

def test_service_model_does_not_override_explicit_probes(self):
conf = {
"type": "service",
"commands": ["python3 -m http.server"],
"port": 8000,
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"probes": [{"type": "http", "url": "/health"}],
}
parsed = parse_run_configuration(conf)
assert isinstance(parsed, ServiceConfiguration)
assert parsed.probes is not None
assert len(parsed.probes) == 1
assert parsed.probes[0].url == "/health"

def test_service_model_explicit_empty_probes_no_default(self):
conf = {
"type": "service",
"commands": ["python3 -m http.server"],
"port": 8000,
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"probes": [],
}
parsed = parse_run_configuration(conf)
assert isinstance(parsed, ServiceConfiguration)
assert parsed.probes is not None
assert len(parsed.probes) == 0

def test_services_replicas_and_scaling(self):
def test_conf(replicas: Any, scaling: Optional[Any] = None):
conf = {
@@ -0,0 +1,98 @@
import pytest

from dstack._internal.core.models.configurations import (
DEFAULT_MODEL_PROBE_TIMEOUT,
DEFAULT_MODEL_PROBE_URL,
ProbeConfig,
ServiceConfiguration,
)
from dstack._internal.core.models.services import OpenAIChatModel
from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator
from dstack._internal.server.testing.common import get_run_spec


@pytest.mark.asyncio
@pytest.mark.usefixtures("image_config_mock")
class TestProbes:
async def test_default_probe_when_model_set(self):
"""When model is set but probes omitted, a default model probe should be generated."""
configuration = ServiceConfiguration(
port=80,
image="debian",
model=OpenAIChatModel(
name="meta-llama/Meta-Llama-3.1-8B-Instruct",
format="openai",
),
)
run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration)
configurator = ServiceJobConfigurator(run_spec)

job_specs = await configurator.get_job_specs(replica_num=0)

assert len(job_specs) == 1
probes = job_specs[0].probes
assert len(probes) == 1
probe = probes[0]
assert probe.type == "http"
assert probe.method == "post"
assert probe.url == DEFAULT_MODEL_PROBE_URL
assert probe.timeout == DEFAULT_MODEL_PROBE_TIMEOUT
assert len(probe.headers) == 1
assert probe.headers[0].name == "Content-Type"
assert probe.headers[0].value == "application/json"
assert "meta-llama/Meta-Llama-3.1-8B-Instruct" in (probe.body or "")
assert "max_tokens" in (probe.body or "")

async def test_explicit_probes_not_overridden(self):
"""When probes are explicitly set, they should be used as-is."""
configuration = ServiceConfiguration(
port=80,
image="debian",
model=OpenAIChatModel(
name="meta-llama/Meta-Llama-3.1-8B-Instruct",
format="openai",
),
probes=[ProbeConfig(type="http", url="/health")],
)
run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration)
configurator = ServiceJobConfigurator(run_spec)

job_specs = await configurator.get_job_specs(replica_num=0)

assert len(job_specs) == 1
probes = job_specs[0].probes
assert len(probes) == 1
assert probes[0].url == "/health"

async def test_explicit_empty_probes(self):
"""When probes is explicitly set to empty list, no probes should be generated."""
configuration = ServiceConfiguration(
port=80,
image="debian",
model=OpenAIChatModel(
name="meta-llama/Meta-Llama-3.1-8B-Instruct",
format="openai",
),
probes=[],
)
run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration)
configurator = ServiceJobConfigurator(run_spec)

job_specs = await configurator.get_job_specs(replica_num=0)

assert len(job_specs) == 1
assert len(job_specs[0].probes) == 0

async def test_no_probe_when_no_model(self):
"""When neither model nor probes are set, no probes should be generated."""
configuration = ServiceConfiguration(
port=80,
image="debian",
)
run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration)
configurator = ServiceJobConfigurator(run_spec)

job_specs = await configurator.get_job_specs(replica_num=0)

assert len(job_specs) == 1
assert len(job_specs[0].probes) == 0