diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index fed9f7cb39..d40984866b 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -290,7 +290,7 @@ $ dstack ps --verbose -??? info "Probe statuses" +??? info "Status" The following symbols are used for probe statuses: - `×` — the last probe execution failed. @@ -328,6 +328,11 @@ Probes are executed for each service replica while the replica is `running`. A p +??? info "Model" + If you set the [`model`](#model) property but don't explicitly configure `probes`, + `dstack` automatically configures a default probe that tests the model using the `/v1/chat/completions` API. + To disable probes entirely when `model` is set, explicitly set `probes` to an empty list. + See the [reference](../reference/dstack.yml/service.md#probes) for more probe configuration options. ### Path prefix { #path-prefix } @@ -425,6 +430,9 @@ Limits apply to the whole service (all replicas) and per client (by IP). Clients If the service runs a model with an OpenAI-compatible interface, you can set the [`model`](#model) property to make the model accessible through `dstack`'s chat UI on the `Models` page. In this case, `dstack` will use the service's `/v1/chat/completions` service. +When `model` is set, `dstack` automatically configures [`probes`](#probes) to verify model health. +To customize the probe, set `probes` explicitly; to disable it, set `probes` to an empty list. + ### Resources If you specify memory size, you can either specify an explicit size (e.g. 
`24GB`) or a diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index fc76fe43ed..1077eff8a9 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -354,7 +354,7 @@ def interpolate_env(self, conf: RunConfigurationT): password=interpolator.interpolate_or_error(conf.registry_auth.password), ) if isinstance(conf, ServiceConfiguration): - for probe in conf.probes: + for probe in conf.probes or []: for header in probe.headers: header.value = interpolator.interpolate_or_error(header.value) if probe.url: diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 3b2c7812b9..465df261b1 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -56,6 +56,8 @@ DEFAULT_PROBE_METHOD = "get" MAX_PROBE_URL_LEN = 2048 DEFAULT_REPLICA_GROUP_NAME = "0" +DEFAULT_MODEL_PROBE_TIMEOUT = 30 +DEFAULT_MODEL_PROBE_URL = "/v1/chat/completions" class RunConfigurationType(str, Enum): @@ -851,9 +853,13 @@ class ServiceConfigurationParams(CoreModel): ] = None rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = [] probes: Annotated[ - list[ProbeConfig], - Field(description="List of probes used to determine job health"), - ] = [] + Optional[list[ProbeConfig]], + Field( + description="The list of probes to determine service health. " + "If `model` is set, defaults to a `/v1/chat/completions` probe. 
" + "Set explicitly to override" + ), + ] = None # None = omitted (may get default when model is set); [] = explicit empty replicas: Annotated[ Optional[Union[List[ReplicaGroup], Range[int]]], @@ -895,7 +901,9 @@ def validate_rate_limits(cls, v: list[RateLimit]) -> list[RateLimit]: return v @validator("probes") - def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]: + def validate_probes(cls, v: Optional[list[ProbeConfig]]) -> Optional[list[ProbeConfig]]: + if v is None: + return v if has_duplicates(v): # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug: # https://github.com/pydantic/pydantic/issues/3765 diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index df6738a774..8e3d2374d7 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -5,12 +5,15 @@ from pathlib import PurePosixPath from typing import Dict, List, Optional +import orjson from cachetools import TTLCache, cached from dstack._internal import settings from dstack._internal.core.errors import DockerRegistryError, ServerClientError from dstack._internal.core.models.common import RegistryAuth from dstack._internal.core.models.configurations import ( + DEFAULT_MODEL_PROBE_TIMEOUT, + DEFAULT_MODEL_PROBE_URL, DEFAULT_PROBE_INTERVAL, DEFAULT_PROBE_METHOD, DEFAULT_PROBE_READY_AFTER, @@ -18,6 +21,7 @@ DEFAULT_PROBE_URL, DEFAULT_REPLICA_GROUP_NAME, LEGACY_REPO_DIR, + HTTPHeaderSpec, PortMapping, ProbeConfig, PythonVersion, @@ -39,6 +43,7 @@ Retry, RunSpec, ) +from dstack._internal.core.models.services import OpenAIChatModel from dstack._internal.core.models.unix import UnixUser from dstack._internal.core.models.volumes import MountPoint, VolumeMountPoint from dstack._internal.core.services.profiles import get_retry @@ -394,7 +399,13 @@ def _service_port(self) -> Optional[int]: 
def _probes(self) -> list[ProbeSpec]:
    """Return the probe specs for this run's jobs.

    Only service configurations have probes. An explicitly configured
    `probes` list is converted as-is, so an explicit empty list yields no
    probes. When `probes` is omitted (`None`) and `model` is an
    `OpenAIChatModel`, a single default model probe is returned instead.
    """
    configuration = self.run_spec.configuration
    if not isinstance(configuration, ServiceConfiguration):
        return []
    if configuration.probes is not None:
        return [_probe_config_to_spec(probe) for probe in configuration.probes]
    # `probes` was omitted: fall back to the model probe when a model is set.
    if isinstance(configuration.model, OpenAIChatModel):
        return [_default_model_probe_spec(configuration.model.name)]
    return []


def _default_model_probe_spec(model_name: str) -> ProbeSpec:
    """Build the default probe for a service that serves `model_name`.

    The probe POSTs a minimal chat request (one "hi" user message, limited to
    a single token) as JSON to `DEFAULT_MODEL_PROBE_URL`.
    """
    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 1,
    }
    content_type = HTTPHeaderSpec(name="Content-Type", value="application/json")
    return ProbeSpec(
        type="http",
        method="post",
        url=DEFAULT_MODEL_PROBE_URL,
        headers=[content_type],
        # The serialized payload is decoded so that `body` is passed as text.
        body=orjson.dumps(payload).decode("utf-8"),
        timeout=DEFAULT_MODEL_PROBE_TIMEOUT,
        interval=DEFAULT_PROBE_INTERVAL,
        ready_after=DEFAULT_PROBE_READY_AFTER,
    )
def test_service_model_probes_none_when_omitted(self):
    """Parsing keeps `probes` as None when `model` is set and `probes` is omitted.

    The default probe is generated server-side in the job configurator.
    """
    service = parse_run_configuration(
        {
            "type": "service",
            "commands": ["python3 -m http.server"],
            "port": 8000,
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        }
    )
    assert isinstance(service, ServiceConfiguration)
    assert service.probes is None


def test_service_model_does_not_override_explicit_probes(self):
    """Explicitly configured probes are kept even when `model` is set."""
    service = parse_run_configuration(
        {
            "type": "service",
            "commands": ["python3 -m http.server"],
            "port": 8000,
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "probes": [{"type": "http", "url": "/health"}],
        }
    )
    assert isinstance(service, ServiceConfiguration)
    assert service.probes is not None
    assert len(service.probes) == 1
    assert service.probes[0].url == "/health"


def test_service_model_explicit_empty_probes_no_default(self):
    """An explicit empty `probes` list stays an empty list, not None."""
    service = parse_run_configuration(
        {
            "type": "service",
            "commands": ["python3 -m http.server"],
            "port": 8000,
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "probes": [],
        }
    )
    assert isinstance(service, ServiceConfiguration)
    assert service.probes is not None
    assert len(service.probes) == 0
@pytest.mark.asyncio
@pytest.mark.usefixtures("image_config_mock")
class TestProbes:
    """Probes that `ServiceJobConfigurator` puts into a service's job spec."""

    MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    @classmethod
    def _model(cls) -> OpenAIChatModel:
        """Return the OpenAI-format model shared by the model-related tests."""
        return OpenAIChatModel(name=cls.MODEL_NAME, format="openai")

    @staticmethod
    async def _job_probes(**service_params):
        """Return the probes of the single job configured for replica 0.

        `service_params` are passed to `ServiceConfiguration` in addition to
        the fixed `port=80, image="debian"`; parameters left out keep their
        defaults.
        """
        configuration = ServiceConfiguration(port=80, image="debian", **service_params)
        run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration)
        job_specs = await ServiceJobConfigurator(run_spec).get_job_specs(replica_num=0)
        assert len(job_specs) == 1
        return job_specs[0].probes

    async def test_default_probe_when_model_set(self):
        """With `model` set and `probes` omitted, a default model probe is generated."""
        probes = await self._job_probes(model=self._model())

        assert len(probes) == 1
        probe = probes[0]
        assert probe.type == "http"
        assert probe.method == "post"
        assert probe.url == DEFAULT_MODEL_PROBE_URL
        assert probe.timeout == DEFAULT_MODEL_PROBE_TIMEOUT
        assert len(probe.headers) == 1
        header = probe.headers[0]
        assert header.name == "Content-Type"
        assert header.value == "application/json"
        body = probe.body or ""
        assert "meta-llama/Meta-Llama-3.1-8B-Instruct" in body
        assert "max_tokens" in body

    async def test_explicit_probes_not_overridden(self):
        """Explicitly configured probes are used as-is, even when `model` is set."""
        probes = await self._job_probes(
            model=self._model(),
            probes=[ProbeConfig(type="http", url="/health")],
        )

        assert len(probes) == 1
        assert probes[0].url == "/health"

    async def test_explicit_empty_probes(self):
        """An explicit empty `probes` list results in no probes."""
        probes = await self._job_probes(model=self._model(), probes=[])

        assert len(probes) == 0

    async def test_no_probe_when_no_model(self):
        """With neither `model` nor `probes` set, no probes are generated."""
        probes = await self._job_probes()

        assert len(probes) == 0