diff --git a/.ci/README.md b/.ci/README.md new file mode 100644 index 0000000..0bd59bd --- /dev/null +++ b/.ci/README.md @@ -0,0 +1,92 @@ +# .ci — CI 镜像与流水线 + +``` +.ci/ +├── config.yaml # 统一配置(镜像、job 定义) +├── build.py # 镜像构建 +├── run.py # CI 流水线执行 +└── images/ + ├── nvidia/Dockerfile + └── ascend/Dockerfile +``` + +**前置依赖**:Docker、Python 3.10+、`pip install pyyaml` + +--- + +## 配置文件 `config.yaml` + +```yaml +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + +jobs: + nvidia_gpu: + image: latest # latest | + platform: nvidia + resources: + gpu_ids: "0" # "0" | "0,2" | "all" + memory: 32GB + shm_size: 16g # 避免 PyTorch SHMEM 不足 + timeout: 3600 # 容器内脚本最大运行秒数 + setup: pip install .[dev] + env: # 可选,注入容器环境变量 + MY_VAR: value + stages: + - name: test + run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml +``` + +--- + +## 镜像构建 `build.py` + +| 参数 | 说明 | +|---|---| +| `--platform nvidia\|ascend\|all` | 构建平台,默认 `all` | +| `--force` | 跳过 Dockerfile 变更检测 | +| `--dry-run` | 打印命令不执行 | + +```bash +# 检测变更后构建(无变更自动跳过) +python .ci/build.py --platform nvidia + +# 强制构建 +python .ci/build.py --platform nvidia --force +``` + +构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 +代理、`no_proxy` 自动从宿主机环境变量透传到 `docker build`。 + +> `--push` 为预留功能,需在 `config.yaml` 中配置 `registry` 段后方可使用。 + +--- + +## 流水线执行 `run.py` + +| 参数 | 说明 | +|---|---| +| `--branch` | 覆盖克隆分支 | +| `--stage` | 只运行指定 stage | +| `--image-tag` | 覆盖镜像 tag | +| `--gpu-id` | 覆盖 GPU 设备 ID | +| `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | +| `--dry-run` | 打印 docker 命令不执行 | + +```bash +# 运行默认 job +python .ci/run.py --branch feat/my-feature --results-dir ./ci-results + +# 只跑 test stage,预览命令 +python .ci/run.py --stage test --dry-run +``` + +容器内执行流程:`git clone` → `checkout` → `setup` → stages。 +代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 diff 
--git a/.ci/build.py b/.ci/build.py new file mode 100644 index 0000000..2339319 --- /dev/null +++ b/.ci/build.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" + +import argparse +import json +import os +import shlex +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) + sys.exit(1) + + return result.stdout.strip() + + +def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): + """Check if any file under `dockerfile_dir` changed since `base_ref`.""" + result = subprocess.run( + ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + print( + "warning: git diff failed (shallow clone or initial commit?);" + " assuming Dockerfile changed", + file=sys.stderr, + ) + return True + + return bool(result.stdout.strip()) + + +def docker_login(registry_cfg, dry_run): + """Log in to the registry using `credentials_env` token. + + Returns True on success. + + NOTE: Registry support is currently unused (`config.yaml` has no registry + section). Retained for future integration with an external image management + system. 
+ """ + credentials_env = registry_cfg.get("credentials_env") + registry_url = registry_cfg.get("url", "") + + if not credentials_env or not registry_url: + return True + + token = os.environ.get(credentials_env) + + if not token: + print( + f"error: {credentials_env} not set, cannot login", + file=sys.stderr, + ) + return False + + if dry_run: + print( + f"[dry-run] echo | docker login {registry_url}" + " --username token --password-stdin" + ) + return True + + result = subprocess.run( + ["docker", "login", registry_url, "--username", "token", "--password-stdin"], + input=token, + text=True, + ) + + if result.returncode != 0: + print("error: docker login failed", file=sys.stderr) + return False + + return True + + +def build_image_tag(registry_url, project, platform, tag): + if registry_url: + return f"{registry_url}/{project}/{platform}:{tag}" + + return f"{project}-ci/{platform}:{tag}" + + +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run, logged_in): + """Build a single platform image. 
Returns True on success.""" + registry_url = registry_cfg.get("url", "") + project = registry_cfg.get("project", "infiniops") + dockerfile_dir = platform_cfg["dockerfile"] + commit_tag = build_image_tag(registry_url, project, platform, commit) + latest_tag = build_image_tag(registry_url, project, platform, "latest") + + build_args_cfg = platform_cfg.get("build_args", {}) + build_cmd = ["docker", "build", "--network", "host"] + + for key, value in build_args_cfg.items(): + build_cmd.extend(["--build-arg", f"{key}={value}"]) + + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + + if proxy_val: + build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) + + private_sdk = platform_cfg.get("private_sdk", {}) + + if private_sdk: + source_env = private_sdk.get("source_env", "") + sdk_url = os.environ.get(source_env, "") if source_env else "" + + if sdk_url: + build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) + + build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) + + if dry_run: + print(f"[dry-run] {shlex.join(build_cmd)}") + + if push: + if not logged_in: + print("[dry-run] (skipping push: docker login failed)") + else: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") + + return True + + print(f"==> building {platform}: {commit_tag}", file=sys.stderr) + result = subprocess.run(build_cmd) + + if result.returncode != 0: + error = { + "stage": "build", + "platform": platform, + "tag": commit_tag, + "exit_code": result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + if push: + if not logged_in: + print("error: docker login failed, cannot push", file=sys.stderr) + return False + + for tag in (commit_tag, latest_tag): + print(f"==> pushing {tag}", file=sys.stderr) + push_result = subprocess.run(["docker", 
"push", tag]) + + if push_result.returncode != 0: + error = { + "stage": "push", + "platform": platform, + "tag": tag, + "exit_code": push_result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + return True + + +def main(): + parser = argparse.ArgumentParser(description="Build CI Docker images") + parser.add_argument( + "--platform", + type=str, + default="all", + help="Platform to build: nvidia, ascend, or all (default: all)", + ) + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument( + "--commit", + type=str, + default="HEAD", + help="Git ref for tagging the image (default: HEAD)", + ) + parser.add_argument( + "--push", + action="store_true", + help="Push images to registry after building (requires registry in config)", + ) + parser.add_argument( + "--force", + action="store_true", + help="Skip change detection and force build", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands without executing", + ) + args = parser.parse_args() + + config = load_config(args.config) + registry_cfg = config.get("registry", {}) + images_cfg = config.get("images", {}) + + if not images_cfg: + print("error: no `images` section in config", file=sys.stderr) + sys.exit(1) + + if args.platform == "all": + platforms = list(images_cfg.keys()) + else: + if args.platform not in images_cfg: + print( + f"error: platform `{args.platform}` not found in config", + file=sys.stderr, + ) + sys.exit(1) + platforms = [args.platform] + + commit = get_git_commit(args.commit) + logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True + failed = False + + for platform in platforms: + platform_cfg = images_cfg[platform] + dockerfile_dir = platform_cfg["dockerfile"] + + if not Path(dockerfile_dir).is_dir(): + print( + f"warning: dockerfile directory `{dockerfile_dir}` does not exist," + f" skipping 
{platform}", + file=sys.stderr, + ) + continue + + if not args.force and not has_dockerfile_changed(dockerfile_dir): + print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) + continue + + ok = build_image( + platform, + platform_cfg, + registry_cfg, + commit, + args.push, + args.dry_run, + logged_in=logged_in, + ) + + if not ok: + failed = True + + if failed: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/config.yaml b/.ci/config.yaml new file mode 100644 index 0000000..a86174a --- /dev/null +++ b/.ci/config.yaml @@ -0,0 +1,33 @@ +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: # TODO: Ascend image is not ready yet + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source_env: PRIVATE_SDK_URL + +jobs: + nvidia_gpu: + image: latest + platform: nvidia + resources: + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + memory: 32GB + shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 + timeout: 3600 + + setup: pip install .[dev] + # env: # 可选,注入容器环境变量 + # MY_VAR: value + + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile new file mode 100644 index 0000000..66392eb --- /dev/null +++ b/.ci/images/ascend/Dockerfile @@ -0,0 +1,39 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + coreutils \ + curl \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PRIVATE_SDK_URL +RUN if [ -n "$PRIVATE_SDK_URL" ]; then \ + curl -fSL "$PRIVATE_SDK_URL" -o 
/tmp/sdk.run && \ + chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \ + rm /tmp/sdk.run; \ + fi + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile new file mode 100644 index 0000000..74ccfd1 --- /dev/null +++ b/.ci/images/nvidia/Dockerfile @@ -0,0 +1,31 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + coreutils \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py new file mode 100644 index 0000000..0c8d648 --- /dev/null +++ b/.ci/run.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" + +import argparse +import os +import shlex +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() + + +def build_results_dir(base, platform, stages, commit): + """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}`.""" + stage_names = "+".join(s["name"] for s in stages) + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + dirname = f"{platform}_{stage_names}_{commit}_{timestamp}" + + return Path(base) / dirname + + +def resolve_image(config, platform, image_tag): + """Resolve an image reference to a full image name. + + Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config + contains a registry section, returns a registry-prefixed URL. Otherwise + returns a local tag (current default). 
+ """ + registry = config.get("registry", {}) + registry_url = registry.get("url", "") + project = registry.get("project", "infiniops") + + if not registry_url: + return f"{project}-ci/{platform}:{image_tag}" + + return f"{registry_url}/{project}/{platform}:{image_tag}" + + +def build_runner_script(): + return r""" +set -e +cd /workspace +mkdir -p /workspace/results +git clone "$REPO_URL" repo +cd repo +git checkout "$BRANCH" +echo "========== Setup ==========" +eval "$SETUP_CMD" +set +e +failed=0 +for i in $(seq 1 "$NUM_STAGES"); do + name_var="STAGE_${i}_NAME" + cmd_var="STAGE_${i}_CMD" + name="${!name_var}" + cmd="${!cmd_var}" + echo "========== Stage: $name ==========" + eval "$cmd" || failed=1 +done +echo "========== Summary ==========" +if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then + chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true +fi +exit $failed +""" + + +def build_docker_args( + config, + job_name, + repo_url, + branch, + stages, + workdir, + image_tag_override, + gpu_id_override=None, + results_dir=None, +): + job = config["jobs"][job_name] + platform = job.get("platform", "nvidia") + image_tag = image_tag_override or job.get("image", "latest") + image = resolve_image(config, platform, image_tag) + resources = job.get("resources", {}) + setup_raw = job.get("setup", "pip install .[dev]") + + if isinstance(setup_raw, list): + setup_cmd = "\n".join(setup_raw) + else: + setup_cmd = setup_raw + + args = [ + "docker", + "run", + "--rm", + "--network", + "host", + "-i", + "-w", + workdir, + "-e", + f"REPO_URL={repo_url}", + "-e", + f"BRANCH={branch}", + "-e", + f"SETUP_CMD={setup_cmd}", + "-e", + f"NUM_STAGES={len(stages)}", + "-e", + f"HOST_UID={os.getuid()}", + "-e", + f"HOST_GID={os.getgid()}", + ] + + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + + if proxy_val: + args.extend(["-e", f"{proxy_var}={proxy_val}"]) + args.extend(["-e", 
f"{proxy_var.lower()}={proxy_val}"]) + + for key, value in job.get("env", {}).items(): + args.extend(["-e", f"{key}={value}"]) + + if results_dir: + args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) + + for i, s in enumerate(stages): + args.append("-e") + args.append(f"STAGE_{i + 1}_NAME={s['name']}") + args.append("-e") + args.append(f"STAGE_{i + 1}_CMD={s['run']}") + + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) + gpu_count = resources.get("gpu_count", 0) + + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + + memory = resources.get("memory") + + if memory: + mem = str(memory).lower().replace("gb", "g").replace("mb", "m") + + if not mem.endswith("g") and not mem.endswith("m"): + mem = f"{mem}g" + + args.extend(["--memory", mem]) + + shm_size = resources.get("shm_size") + + if shm_size: + args.extend(["--shm-size", str(shm_size)]) + + timeout_sec = resources.get("timeout") + args.append(image) + + if timeout_sec: + # Requires coreutils `timeout` inside the container image. 
+ args.extend(["timeout", str(timeout_sec)]) + + args.extend(["bash", "-c", build_runner_script().strip()]) + + return args + + +def main(): + parser = argparse.ArgumentParser(description="Run Docker CI pipeline") + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument("--branch", type=str, help="Override repo branch") + parser.add_argument("--job", type=str, help="Job name to run (default: first job)") + parser.add_argument( + "--stage", + type=str, + help="Run only this stage name (still runs setup first)", + ) + parser.add_argument( + "--image-tag", + type=str, + help="Override image tag (stable, latest, or commit hash)", + ) + parser.add_argument( + "--gpu-id", + type=str, + help='GPU device IDs to use, e.g. "0", "0,2", "all"', + ) + parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + help="Base directory for test results (default: ./ci-results)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print docker command and exit", + ) + args = parser.parse_args() + + config = load_config(args.config) + repo = config.get("repo", {}) + repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") + branch = args.branch or repo.get("branch", "master") + + jobs = config.get("jobs", {}) + + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) + + job_name = args.job or next(iter(jobs)) + + if job_name not in jobs: + print(f"error: job {job_name!r} not in config", file=sys.stderr) + sys.exit(1) + + job = jobs[job_name] + all_stages = job.get("stages", []) + + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + + if not stages: + print(f"error: stage {args.stage!r} not found", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + platform = job.get("platform", "nvidia") + commit = get_git_commit() + results_dir = 
build_results_dir(args.results_dir, platform, stages, commit) + + workdir = "/workspace" + docker_args = build_docker_args( + config, + job_name, + repo_url, + branch, + stages, + workdir, + args.image_tag, + gpu_id_override=args.gpu_id, + results_dir=results_dir, + ) + + if args.dry_run: + print(shlex.join(docker_args)) + return + + results_dir.mkdir(parents=True, exist_ok=True) + sys.exit(subprocess.run(docker_args).returncode) + + +if __name__ == "__main__": + main() diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py new file mode 100644 index 0000000..98079cd --- /dev/null +++ b/.ci/tests/conftest.py @@ -0,0 +1,42 @@ +import sys +from pathlib import Path + +# Allow `import run` and `import build` directly. +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import pytest + + +@pytest.fixture +def minimal_config(): + return { + "repo": { + "url": "https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "images": { + "nvidia": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + }, + "jobs": { + "nvidia_gpu": { + "image": "latest", + "platform": "nvidia", + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "setup": "pip install .[dev]", + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], + } + }, + } diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py new file mode 100644 index 0000000..fa2f292 --- /dev/null +++ b/.ci/tests/test_build.py @@ -0,0 +1,186 @@ +import build + + +# --------------------------------------------------------------------------- +# build_image_tag +# --------------------------------------------------------------------------- + + +def test_build_image_tag_with_registry(): + tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") + 
assert tag == "localhost:5000/infiniops/nvidia:latest" + + +def test_build_image_tag_without_registry(): + tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") + assert tag == "infiniops-ci/nvidia:abc1234" + + +def test_build_image_tag_commit_hash(): + tag = build.build_image_tag( + "registry.example.com:5000", "proj", "ascend", "deadbeef" + ) + assert tag == "registry.example.com:5000/proj/ascend:deadbeef" + + +# --------------------------------------------------------------------------- +# has_dockerfile_changed +# --------------------------------------------------------------------------- + + +def test_has_dockerfile_changed_true_when_stdout_nonempty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout="Dockerfile\n"), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +def test_has_dockerfile_changed_false_when_stdout_empty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is False + + +def test_has_dockerfile_changed_true_on_git_error(mocker): + # Shallow clone or initial commit: `git diff` returns non-zero. 
+ mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=128, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +# --------------------------------------------------------------------------- +# docker_login +# --------------------------------------------------------------------------- + + +def test_docker_login_no_credentials_env(mocker): + run_mock = mocker.patch("subprocess.run") + result = build.docker_login({"url": "localhost:5000"}, dry_run=False) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_token_not_set(mocker, monkeypatch, capsys): + monkeypatch.delenv("REGISTRY_TOKEN", raising=False) + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is False + run_mock.assert_not_called() + + +def test_docker_login_dry_run_does_not_call_subprocess(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=True) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_success(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is True + run_mock.assert_called_once() + cmd = run_mock.call_args[0][0] + assert "docker" in cmd + assert "login" in cmd + + +# --------------------------------------------------------------------------- +# build_image — dry_run and proxy +# --------------------------------------------------------------------------- + + +def _platform_cfg(): + return { + "dockerfile": ".ci/images/nvidia/", + 
"build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + + +def _registry_cfg(): + return {"url": "localhost:5000", "project": "infiniops"} + + +def test_build_image_dry_run_no_subprocess(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + run_mock = mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + run_mock.assert_not_called() + captured = capsys.readouterr() + assert "[dry-run]" in captured.out + + +def test_build_image_dry_run_output_contains_image_tag(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + captured = capsys.readouterr() + assert "abc1234" in captured.out + + +def test_build_image_proxy_in_build_args(mocker, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + called_cmd = run_mock.call_args[0][0] + joined = " ".join(called_cmd) + assert "HTTP_PROXY=http://proxy.test:3128" in joined + assert "http_proxy=http://proxy.test:3128" in joined + + +def test_build_image_returns_false_on_docker_error(mocker, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=1), + ) + result = build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + assert result is False diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py new file mode 100644 index 0000000..075546e --- /dev/null +++ 
b/.ci/tests/test_run.py @@ -0,0 +1,298 @@ +from pathlib import Path + +import pytest + +import run + + +# --------------------------------------------------------------------------- +# resolve_image +# --------------------------------------------------------------------------- + + +def test_resolve_image_with_registry(): + cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} + img = run.resolve_image(cfg, "nvidia", "latest") + assert img == "localhost:5000/infiniops/nvidia:latest" + + +def test_resolve_image_without_registry(minimal_config): + img = run.resolve_image(minimal_config, "nvidia", "abc1234") + assert img == "infiniops-ci/nvidia:abc1234" + + +# --------------------------------------------------------------------------- +# build_runner_script +# --------------------------------------------------------------------------- + + +def test_runner_script_contains_git_clone(): + script = run.build_runner_script() + assert "git clone" in script + + +def test_runner_script_contains_setup_cmd(): + script = run.build_runner_script() + assert "SETUP_CMD" in script + + +def test_runner_script_exits_on_failure(): + script = run.build_runner_script() + assert "exit $failed" in script + + +def test_runner_script_creates_results_dir(): + script = run.build_runner_script() + assert "mkdir -p /workspace/results" in script + + +# --------------------------------------------------------------------------- +# build_docker_args — basic structure +# --------------------------------------------------------------------------- + + +def test_docker_args_basic_structure(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert args[0] == "docker" + assert "run" in args + assert "--rm" in args + + +def test_docker_args_correct_image(minimal_config): + args = run.build_docker_args( + minimal_config, + 
"nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "infiniops-ci/nvidia:latest" in args + + +def test_docker_args_image_tag_override(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + "abc1234", + ) + assert "infiniops-ci/nvidia:abc1234" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — proxy passthrough +# --------------------------------------------------------------------------- + + +def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "-e" in args + assert "HTTP_PROXY=http://proxy.example.com:8080" in args + assert "http_proxy=http://proxy.example.com:8080" in args + + +def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.delenv("http_proxy", raising=False) + monkeypatch.delenv("HTTPS_PROXY", raising=False) + monkeypatch.delenv("https_proxy", raising=False) + monkeypatch.delenv("NO_PROXY", raising=False) + monkeypatch.delenv("no_proxy", raising=False) + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + + for arg in args: + assert not arg.startswith("HTTP_PROXY=") + assert not arg.startswith("http_proxy=") + assert not arg.startswith("HTTPS_PROXY=") + assert not arg.startswith("https_proxy=") + assert not 
arg.startswith("NO_PROXY=") + assert not arg.startswith("no_proxy=") + + +def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "HTTP_PROXY=http://lowercase.proxy:3128" in args + assert "http_proxy=http://lowercase.proxy:3128" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — GPU flags +# --------------------------------------------------------------------------- + + +def _make_args(config, gpu_id_override=None): + return run.build_docker_args( + config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + gpu_id_override=gpu_id_override, + ) + + +def test_docker_args_gpu_device(minimal_config): + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert "device=0" in args[idx + 1] + + +def test_docker_args_gpu_all(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert args[idx + 1] == "all" + + +def test_docker_args_no_gpu(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "" + minimal_config["jobs"]["nvidia_gpu"]["resources"].pop("gpu_count", None) + args = _make_args(minimal_config) + assert "--gpus" not in args + + +def test_docker_args_gpu_override(minimal_config): + args = _make_args(minimal_config, gpu_id_override="2,3") + idx = args.index("--gpus") + assert "2,3" in args[idx + 1] + + +# --------------------------------------------------------------------------- +# build_docker_args — memory format +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("32GB", "32g"), + ("512MB", "512m"), + ("8", "8g"), + ("16gb", "16g"), + ("256mb", "256m"), + ], +) +def test_docker_args_memory_format(minimal_config, raw, expected): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw + args = _make_args(minimal_config) + idx = args.index("--memory") + assert args[idx + 1] == expected + + +# --------------------------------------------------------------------------- +# build_docker_args — stages encoding +# --------------------------------------------------------------------------- + + +def test_docker_args_num_stages(minimal_config): + args = _make_args(minimal_config) + assert "NUM_STAGES=1" in args + + +def test_docker_args_stage_name_cmd(minimal_config): + args = _make_args(minimal_config) + assert "STAGE_1_NAME=test" in args + assert any(a.startswith("STAGE_1_CMD=") for a in args) + + +def test_docker_args_multiple_stages(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ + {"name": "lint", "run": "ruff check ."}, + {"name": "test", "run": "pytest tests/"}, + ] + args = _make_args(minimal_config) + assert "NUM_STAGES=2" in args + assert "STAGE_1_NAME=lint" in args + assert "STAGE_2_NAME=test" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — results_dir mount +# --------------------------------------------------------------------------- + + +def test_docker_args_results_dir(minimal_config, tmp_path): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + results_dir=tmp_path, + ) + joined = " ".join(str(a) for a in args) + assert "-v" in args + assert "/workspace/results" in joined + + +# 
---------------------------------------------------------------------------
# build_results_dir
# ---------------------------------------------------------------------------


def test_build_results_dir_contains_platform():
    # The directory name embeds the platform so runs on different
    # platforms land in distinct result folders.
    stages = [{"name": "test", "run": "pytest"}]
    d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234")
    assert "nvidia" in d.name


def test_build_results_dir_contains_commit():
    # The short commit hash in the name ties results back to a revision.
    stages = [{"name": "test", "run": "pytest"}]
    d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234")
    assert "abc1234" in d.name


def test_build_results_dir_contains_stage_names():
    # Multiple stage names are joined with '+' in the directory name.
    stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}]
    d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234")
    assert "lint+test" in d.name


def test_build_results_dir_under_base():
    # The generated directory is a direct child of the requested base dir.
    stages = [{"name": "test", "run": "pytest"}]
    d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678")
    assert d.parent == Path("/tmp/my-results")
diff --git a/pyproject.toml b/pyproject.toml
index 765b90a..3dbc186 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@
 name = "InfiniOps"
 version = "0.1.0"
 
 [project.optional-dependencies]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"]
 
 [tool.scikit-build.wheel]
 install-dir = "infini"
diff --git a/tests/test_add.py b/tests/test_add.py
index 1c98d91..61d6715 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -4,15 +4,39 @@
 from tests.utils import Payload, empty_strided, randint_strided, randn_strided
 
-_INT_DTYPES = (
-    torch.int16,
-    torch.uint16,
-    torch.int32,
-    torch.uint32,
-    torch.int64,
-    torch.uint64,
+_INT_DTYPES = tuple(
+    d
+    for d in (
+        torch.int16,
+        torch.int32,
+        torch.int64,
+    )
+    if d is not None
 )
+_UINT_DTYPES = tuple(
+    d
+    for d in (
+        getattr(torch, "uint16", None),
+        getattr(torch, "uint32", None),
+        getattr(torch, "uint64", 
None), + ) + if d is not None +) + +def _dtype_parametrize(): + candidates = [ + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 5e-3), + (torch.int16, 0, 0), + (torch.int32, 0, 0), + (getattr(torch, "uint32", None), 0, 0), + (torch.int64, 0, 0), + (getattr(torch, "uint64", None), 0, 0), + ] + return tuple((d, r, a) for (d, r, a) in candidates if d is not None) + @pytest.mark.auto_act_and_assert @pytest.mark.parametrize( @@ -32,30 +56,11 @@ ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), ), ) -@pytest.mark.parametrize( - ("dtype", "rtol", "atol"), - ( - (torch.float32, 1e-7, 1e-7), - (torch.float16, 1e-3, 1e-3), - (torch.bfloat16, 1e-2, 5e-3), - (torch.int16, 0, 0), - (torch.uint16, 0, 0), - (torch.int32, 0, 0), - (torch.uint32, 0, 0), - (torch.int64, 0, 0), - (torch.uint64, 0, 0), - ), -) -def test_add( - shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol -): - if dtype in _INT_DTYPES: - input = randint_strided( - 0, 100, shape, input_strides, dtype=dtype, device=device - ) - other = randint_strided( - 0, 100, shape, other_strides, dtype=dtype, device=device - ) +@pytest.mark.parametrize(("dtype", "rtol", "atol"), _dtype_parametrize()) +def test_add(shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol): + if dtype in _INT_DTYPES or dtype in _UINT_DTYPES: + input = randint_strided(0, 100, shape, input_strides, dtype=dtype, device=device) + other = randint_strided(0, 100, shape, other_strides, dtype=dtype, device=device) else: input = randn_strided(shape, input_strides, dtype=dtype, device=device) other = randn_strided(shape, other_strides, dtype=dtype, device=device) @@ -72,10 +77,10 @@ def _add(input, other, out): def _torch_add(input, other, out): - if input.dtype in (torch.uint16, torch.uint32, torch.uint64): + if input.dtype in _UINT_DTYPES: input = input.to(torch.int64) - if other.dtype in (torch.uint16, torch.uint32, torch.uint64): + if other.dtype in 
_UINT_DTYPES: other = other.to(torch.int64) res = torch.add(input, other) diff --git a/tests/test_rms_norm.py b/tests/test_rms_norm.py index f447091..b0c9c5d 100644 --- a/tests/test_rms_norm.py +++ b/tests/test_rms_norm.py @@ -59,4 +59,13 @@ def _rms_norm(input, weight, *, eps=1e-6, out=None): def _torch_rms_norm(input, weight, *, eps=1e-6, out=None): - return torch.nn.functional.rms_norm(input, input.shape[-1:], weight=weight, eps=eps) + rms_norm_fn = getattr(torch.nn.functional, "rms_norm", None) + if rms_norm_fn is not None: + return rms_norm_fn(input, input.shape[-1:], weight=weight, eps=eps) + # Fallback for PyTorch < 2.3: RMS norm = (x / sqrt(mean(x^2) + eps)) * weight + rms = torch.sqrt(torch.mean(input * input, dim=-1, keepdim=True) + eps) + result = (input / rms) * weight + if out is not None: + out.copy_(result) + return out + return result