From ce2bebaa2f12cfcc043d13c2c35d9f8cdac43c9f Mon Sep 17 00:00:00 2001 From: Sinatras Date: Mon, 9 Mar 2026 00:34:39 +0300 Subject: [PATCH 1/6] Add HF dataset export logic and admin commands --- src/kernelbot/api/main.py | 45 ++++++ src/kernelbot/cogs/admin_cog.py | 100 ++++++++++++++ src/kernelbot/env.py | 3 + src/libkernelbot/hf_export.py | 235 ++++++++++++++++++++++++++++++++ 4 files changed, 383 insertions(+) create mode 100644 src/libkernelbot/hf_export.py diff --git a/src/kernelbot/api/main.py b/src/kernelbot/api/main.py index 960d11ff..92f45450 100644 --- a/src/kernelbot/api/main.py +++ b/src/kernelbot/api/main.py @@ -675,6 +675,51 @@ async def admin_update_problems( } +@app.post("/admin/export-hf") +async def admin_export_hf( + payload: dict, + _: Annotated[None, Depends(require_admin)], + db_context=Depends(get_db), +) -> dict: + """Export competition submissions to a Hugging Face dataset as parquet. + + Payload: + leaderboard_ids: list[int] - IDs of leaderboards to export + filename: str - parquet filename in the repo (e.g. 
"nvidia_nvfp4_submissions.parquet") + private: bool - if true, upload to private live repo; if false, upload to public repo (default: true) + """ + from libkernelbot.hf_export import export_to_hf + + leaderboard_ids = payload.get("leaderboard_ids") + filename = payload.get("filename") + private = payload.get("private", True) + + if not leaderboard_ids or not isinstance(leaderboard_ids, list): + raise HTTPException(status_code=400, detail="leaderboard_ids must be a non-empty list of integers") + if not filename or not filename.endswith(".parquet"): + raise HTTPException(status_code=400, detail="filename must end with .parquet") + if not env.HF_TOKEN: + raise HTTPException(status_code=500, detail="HF_TOKEN not configured") + + repo_id = env.HF_PUBLIC_DATASET if not private else env.HF_PRIVATE_DATASET + + try: + with db_context as db: + result = export_to_hf( + db=db, + leaderboard_ids=leaderboard_ids, + repo_id=repo_id, + filename=filename, + token=env.HF_TOKEN, + private=private, + ) + return {"status": "ok", **result} + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) from e + except Exception as e: + raise HTTPException(status_code=500, detail=f"Export failed: {e}") from e + + @app.get("/leaderboards") async def get_leaderboards(db_context=Depends(get_db)): """An endpoint that returns all leaderboards. 
diff --git a/src/kernelbot/cogs/admin_cog.py b/src/kernelbot/cogs/admin_cog.py index c9e2e9f0..81da4cf5 100644 --- a/src/kernelbot/cogs/admin_cog.py +++ b/src/kernelbot/cogs/admin_cog.py @@ -123,7 +123,13 @@ def __init__(self, bot: "ClusterBot"): name="set-forum-ids", description="Sets forum IDs" )(self.set_forum_ids) + self.export_to_hf = bot.admin_group.command( + name="export-hf", description="Export competition data to Hugging Face dataset" + )(self.export_to_hf) + self._scheduled_cleanup_temp_users.start() + if env.HF_TOKEN: + self._scheduled_hf_export.start() # -------------------------------------------------------------------------- # | HELPER FUNCTIONS | @@ -873,6 +879,100 @@ async def _scheduled_cleanup_temp_users(self): db.cleanup_temp_users() logger.info("Temporary users cleanup completed") + @tasks.loop(hours=24) + async def _scheduled_hf_export(self): + """Daily export of active competition submissions to private HF dataset.""" + from libkernelbot.hf_export import export_to_hf, get_active_competition_leaderboards + + try: + with self.bot.leaderboard_db as db: + leaderboards = db.get_leaderboards() + active = get_active_competition_leaderboards( + leaderboards, + now=datetime.now(timezone.utc), + ) + + if not active: + logger.info("HF export: no active competitions, skipping") + return + + leaderboard_ids = [lb["id"] for lb in active] + result = export_to_hf( + db=db, + leaderboard_ids=leaderboard_ids, + repo_id=env.HF_PRIVATE_DATASET, + filename="active_submissions.parquet", + token=env.HF_TOKEN, + private=True, + ) + logger.info("Scheduled HF export complete: %s", result) + except Exception: + logger.exception("Scheduled HF export failed") + + @_scheduled_hf_export.before_loop + async def _before_hf_export(self): + await self.bot.wait_until_ready() + + @discord.app_commands.describe( + leaderboard_name="Name of the competition to export", + filename="Parquet filename (default: .parquet)", + private="Upload to private repo (default: true)", + ) + 
@discord.app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def export_to_hf( + self, + interaction: discord.Interaction, + leaderboard_name: str, + filename: Optional[str] = None, + private: bool = True, + ): + from libkernelbot.hf_export import export_to_hf as do_export + + is_admin = await self.admin_check(interaction) + if not is_admin: + await send_discord_message( + interaction, + "You need to have Admin permissions to run this command", + ephemeral=True, + ) + return + + if not env.HF_TOKEN: + await send_discord_message(interaction, "HF_TOKEN not configured.", ephemeral=True) + return + + await interaction.response.defer(ephemeral=True) + + if filename is None: + filename = f"{leaderboard_name}.parquet" + if not filename.endswith(".parquet"): + filename += ".parquet" + + repo_id = env.HF_PRIVATE_DATASET if private else env.HF_PUBLIC_DATASET + + try: + with self.bot.leaderboard_db as db: + lb_id = db.get_leaderboard_id(leaderboard_name) + result = do_export( + db=db, + leaderboard_ids=[lb_id], + repo_id=repo_id, + filename=filename, + token=env.HF_TOKEN, + private=private, + ) + await send_discord_message( + interaction, + f"Exported {result['rows']} rows to `{repo_id}/{filename}`.", + ephemeral=True, + ) + except ValueError as e: + await send_discord_message(interaction, str(e), ephemeral=True) + except Exception as e: + logger.error("HF export failed: %s", e, exc_info=True) + await send_discord_message(interaction, f"Export failed: {e}", ephemeral=True) + #################################################################################################################### # MIGRATION COMMANDS --- TO BE DELETED LATER #################################################################################################################### diff --git a/src/kernelbot/env.py b/src/kernelbot/env.py index 90dd276c..0c3d11f1 100644 --- a/src/kernelbot/env.py +++ b/src/kernelbot/env.py @@ -17,6 +17,9 @@
env.DISCORD_DEBUG_CLUSTER_STAGING_ID = os.getenv("DISCORD_DEBUG_CLUSTER_STAGING_ID") env.ADMIN_TOKEN = os.getenv("ADMIN_TOKEN") +env.HF_TOKEN = os.getenv("HF_TOKEN") +env.HF_PRIVATE_DATASET = os.getenv("HF_PRIVATE_DATASET", "GPUMODE/kernelbot-data-live") +env.HF_PUBLIC_DATASET = os.getenv("HF_PUBLIC_DATASET", "GPUMODE/kernelbot-data") # Only required to run the CLI against this instance # setting these is required only to run the CLI against local instance diff --git a/src/libkernelbot/hf_export.py b/src/libkernelbot/hf_export.py new file mode 100644 index 00000000..e68d75b9 --- /dev/null +++ b/src/libkernelbot/hf_export.py @@ -0,0 +1,235 @@ +"""Export competition submissions to Hugging Face datasets as parquet files.""" + +import io +import tempfile +from datetime import datetime, timezone + +import pyarrow as pa +import pyarrow.parquet as pq +from huggingface_hub import HfApi + +from libkernelbot.leaderboard_db import LeaderboardDB +from libkernelbot.utils import setup_logging + +logger = setup_logging(__name__) + +# Explicit schema matching GPUMODE/kernelbot-data nvidia_nvfp4_submissions.parquet +SUBMISSIONS_SCHEMA = pa.schema([ + ("submission_id", pa.int64()), + ("leaderboard_id", pa.int64()), + ("problem_name", pa.large_string()), + ("user_id", pa.large_string()), + ("user_name", pa.large_string()), + ("code_id", pa.int64()), + ("file_name", pa.large_string()), + ("submission_time", pa.timestamp("us", tz="UTC")), + ("status", pa.large_string()), + ("score", pa.float64()), + ("passed", pa.bool_()), + ("mode", pa.large_string()), + ("runner", pa.large_string()), + ("code", pa.large_string()), +]) + + +def _normalize_deadline(deadline: datetime) -> datetime: + """Ensure deadlines are timezone-aware before comparing them.""" + if deadline.tzinfo is None: + return deadline.replace(tzinfo=timezone.utc) + return deadline + + +MAX_COMPETITION_HORIZON_DAYS = 365 + + +def get_active_competition_leaderboards( + leaderboards: list[dict], + *, + now: datetime | None = 
None, +) -> list[dict]: + """Return leaderboards that belong to real, active competitions. + + Filters out: + - Expired leaderboards (deadline <= now) + - Dev leaderboards (name ending with "-dev") + - Permanent/practice leaderboards (deadline > 1 year from now, e.g. year 2100) + """ + if now is None: + now = datetime.now(timezone.utc) + + from datetime import timedelta + + horizon = now + timedelta(days=MAX_COMPETITION_HORIZON_DAYS) + + active_competitions = [] + for leaderboard in leaderboards: + deadline = _normalize_deadline(leaderboard["deadline"]) + if deadline > now and deadline < horizon and not leaderboard["name"].endswith("-dev"): + active_competitions.append(leaderboard) + return active_competitions + + +def ensure_public_export_allowed( + db: LeaderboardDB, + leaderboard_ids: list[int], + *, + now: datetime | None = None, +) -> None: + """Block public exports while any selected leaderboard is still active.""" + if now is None: + now = datetime.now(timezone.utc) + + selected_ids = set(leaderboard_ids) + active_names = [] + for leaderboard in db.get_leaderboards(): + if leaderboard["id"] not in selected_ids: + continue + deadline = _normalize_deadline(leaderboard["deadline"]) + if deadline > now: + active_names.append(leaderboard["name"]) + + if active_names: + active_names.sort() + raise ValueError( + "Cannot export active leaderboards to the public dataset: " + + ", ".join(active_names) + ) + + +def get_hf_export_rows(db: LeaderboardDB, leaderboard_ids: list[int]) -> list[dict]: + """Fetch deduplicated submissions for export. + + Deduplicates by (leaderboard_id, user_id, code_id, runner), keeping the + fastest score. Excludes secret runs. 
+ """ + if not leaderboard_ids: + return [] + + db.cursor.execute( + """ + WITH ranked AS ( + SELECT + s.id as submission_id, + s.leaderboard_id, + l.name as problem_name, + s.user_id, + u.user_name, + s.code_id, + s.file_name, + s.submission_time, + COALESCE( + sjs.status, + CASE + WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded' + WHEN s.done THEN 'failed' + ELSE s.status + END + ) as status, + r.score, + r.passed, + r.mode, + r.runner, + COALESCE(c.old_code, convert_from(c.code, 'UTF8')) as code, + ROW_NUMBER() OVER ( + PARTITION BY s.leaderboard_id, s.user_id, s.code_id, r.runner + ORDER BY r.score ASC NULLS LAST, s.submission_time ASC + ) as rn + FROM leaderboard.submission s + JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id + LEFT JOIN leaderboard.user_info u ON s.user_id = u.id + LEFT JOIN leaderboard.submission_job_status sjs ON s.id = sjs.submission_id + LEFT JOIN leaderboard.runs r + ON s.id = r.submission_id AND r.mode = 'leaderboard' AND NOT r.secret + LEFT JOIN leaderboard.code_files c ON s.code_id = c.id + WHERE s.leaderboard_id = ANY(%s) + ) + SELECT + submission_id, leaderboard_id, problem_name, user_id, user_name, + code_id, file_name, submission_time, status, score, passed, mode, + runner, code + FROM ranked + WHERE rn = 1 + ORDER BY problem_name, score ASC NULLS LAST + """, + (leaderboard_ids,), + ) + + columns = [ + "submission_id", "leaderboard_id", "problem_name", "user_id", "user_name", + "code_id", "file_name", "submission_time", "status", "score", "passed", + "mode", "runner", "code", + ] + return [dict(zip(columns, row, strict=True)) for row in db.cursor.fetchall()] + + +def rows_to_parquet_bytes(rows: list[dict]) -> bytes: + """Convert a list of row dicts to parquet bytes using the canonical schema.""" + if not rows: + table = pa.table({field.name: pa.array([], type=field.type) for field in SUBMISSIONS_SCHEMA}) + else: + for row in rows: + if row.get("user_id") is not None: + row["user_id"] = str(row["user_id"]) 
+ if row.get("user_name") is None: + row["user_name"] = "" + if row.get("score") is not None: + row["score"] = float(row["score"]) + table = pa.Table.from_pylist(rows, schema=SUBMISSIONS_SCHEMA) + + buf = io.BytesIO() + pq.write_table(table, buf, compression="snappy") + return buf.getvalue() + + +def export_to_hf( + db: LeaderboardDB, + leaderboard_ids: list[int], + repo_id: str, + filename: str, + token: str, + private: bool = True, +) -> dict: + """Export deduplicated submissions to a HF dataset repo as a parquet file. + + Returns a summary dict with row count and repo info. + """ + if not private: + ensure_public_export_allowed(db, leaderboard_ids) + + api = HfApi(token=token) + api.create_repo(repo_id, repo_type="dataset", private=private, exist_ok=True) + + rows = get_hf_export_rows(db, leaderboard_ids) + parquet_bytes = rows_to_parquet_bytes(rows) + + with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp: + tmp.write(parquet_bytes) + tmp.flush() + api.upload_file( + path_or_fileobj=tmp.name, + path_in_repo=filename, + repo_id=repo_id, + repo_type="dataset", + ) + + logger.info("Exported %d rows to %s/%s", len(rows), repo_id, filename) + return {"rows": len(rows), "repo_id": repo_id, "filename": filename} + + +def publish_to_public_repo( + db: LeaderboardDB, + leaderboard_ids: list[int], + private_repo_id: str, + public_repo_id: str, + filename: str, + token: str, +) -> dict: + """Export final competition data to the public dataset repo.""" + return export_to_hf( + db=db, + leaderboard_ids=leaderboard_ids, + repo_id=public_repo_id, + filename=filename, + token=token, + private=False, + ) From d90a689110d917ec9fce71bab2854706dc35fcdb Mon Sep 17 00:00:00 2001 From: Sinatras Date: Mon, 9 Mar 2026 00:34:45 +0300 Subject: [PATCH 2/6] Add huggingface-hub and datasets dependencies --- pyproject.toml | 2 + uv.lock | 134 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 
20d4fa16..7bc8ee3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,8 @@ dependencies = [ "fastapi[all]", "uvicorn", "jinja2", + "huggingface-hub>=0.20", + "pyarrow>=14.0", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 8cf1b038..b31b6099 100644 --- a/uv.lock +++ b/uv.lock @@ -615,9 +615,11 @@ dependencies = [ { name = "better-profanity" }, { name = "discord-py" }, { name = "fastapi", extra = ["all"] }, + { name = "huggingface-hub" }, { name = "jinja2" }, { name = "modal" }, { name = "psycopg2-binary" }, + { name = "pyarrow" }, { name = "pygithub" }, { name = "python-dotenv" }, { name = "pyyaml" }, @@ -642,10 +644,12 @@ requires-dist = [ { name = "better-profanity" }, { name = "discord-py" }, { name = "fastapi", extras = ["all"] }, + { name = "huggingface-hub", specifier = ">=0.20" }, { name = "jinja2" }, { name = "modal" }, { name = "pre-commit", marker = "extra == 'dev'" }, { name = "psycopg2-binary" }, + { name = "pyarrow", specifier = ">=14.0" }, { name = "pygithub" }, { name = "pytest", marker = "extra == 'dev'" }, { name = "pytest-asyncio", marker = "extra == 'dev'" }, @@ -887,6 +891,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, ] +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, +] + [[package]] name = "grpclib" version = "0.4.8" @@ -922,6 +935,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, ] +[[package]] +name = "hf-xet" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/cb/9bb543bd987ffa1ee48202cc96a756951b734b79a542335c566148ade36c/hf_xet-1.3.2.tar.gz", hash = "sha256:e130ee08984783d12717444e538587fa2119385e5bd8fc2bb9f930419b73a7af", size = 643646, upload-time = "2026-02-27T17:26:08.051Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/75/462285971954269432aad2e7938c5c7ff9ec7d60129cec542ab37121e3d6/hf_xet-1.3.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:335a8f36c55fd35a92d0062f4e9201b4015057e62747b7e7001ffb203c0ee1d2", size = 3761019, upload-time = "2026-02-27T17:25:49.441Z" }, + { url = "https://files.pythonhosted.org/packages/35/56/987b0537ddaf88e17192ea09afa8eca853e55f39a4721578be436f8409df/hf_xet-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c1ae4d3a716afc774e66922f3cac8206bfa707db13f6a7e62dfff74bfc95c9a8", size = 3521565, upload-time = "2026-02-27T17:25:47.469Z" }, + { url = "https://files.pythonhosted.org/packages/a8/5c/7e4a33a3d689f77761156cc34558047569e54af92e4d15a8f493229f6767/hf_xet-1.3.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d6dbdf231efac0b9b39adcf12a07f0c030498f9212a18e8c50224d0e84ab803d", size = 4176494, upload-time = 
"2026-02-27T17:25:40.247Z" }, + { url = "https://files.pythonhosted.org/packages/6b/b3/71e856bf9d9a69b3931837e8bf22e095775f268c8edcd4a9e8c355f92484/hf_xet-1.3.2-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:c1980abfb68ecf6c1c7983379ed7b1e2b49a1aaf1a5aca9acc7d48e5e2e0a961", size = 3955601, upload-time = "2026-02-27T17:25:38.376Z" }, + { url = "https://files.pythonhosted.org/packages/63/d7/aecf97b3f0a981600a67ff4db15e2d433389d698a284bb0ea5d8fcdd6f7f/hf_xet-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:1c88fbd90ad0d27c46b77a445f0a436ebaa94e14965c581123b68b1c52f5fd30", size = 4154770, upload-time = "2026-02-27T17:25:56.756Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e1/3af961f71a40e09bf5ee909842127b6b00f5ab4ee3817599dc0771b79893/hf_xet-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:35b855024ca37f2dd113ac1c08993e997fbe167b9d61f9ef66d3d4f84015e508", size = 4394161, upload-time = "2026-02-27T17:25:58.111Z" }, + { url = "https://files.pythonhosted.org/packages/a1/c3/859509bade9178e21b8b1db867b8e10e9f817ab9ac1de77cb9f461ced765/hf_xet-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:31612ba0629046e425ba50375685a2586e11fb9144270ebabd75878c3eaf6378", size = 3637377, upload-time = "2026-02-27T17:26:10.611Z" }, + { url = "https://files.pythonhosted.org/packages/05/7f/724cfbef4da92d577b71f68bf832961c8919f36c60d28d289a9fc9d024d4/hf_xet-1.3.2-cp313-cp313t-win_arm64.whl", hash = "sha256:433c77c9f4e132b562f37d66c9b22c05b5479f243a1f06a120c1c06ce8b1502a", size = 3497875, upload-time = "2026-02-27T17:26:09.034Z" }, + { url = "https://files.pythonhosted.org/packages/ba/75/9d54c1ae1d05fb704f977eca1671747babf1957f19f38ae75c5933bc2dc1/hf_xet-1.3.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:c34e2c7aefad15792d57067c1c89b2b02c1bbaeabd7f8456ae3d07b4bbaf4094", size = 3761076, upload-time = "2026-02-27T17:25:55.42Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/8a/08a24b6c6f52b5d26848c16e4b6d790bb810d1bf62c3505bed179f7032d3/hf_xet-1.3.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:4bc995d6c41992831f762096020dc14a65fdf3963f86ffed580b596d04de32e3", size = 3521745, upload-time = "2026-02-27T17:25:54.217Z" }, + { url = "https://files.pythonhosted.org/packages/b5/db/a75cf400dd8a1a8acf226a12955ff6ee999f272dfc0505bafd8079a61267/hf_xet-1.3.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:959083c89dee30f7d6f890b36cdadda823386c4de63b1a30384a75bfd2ae995d", size = 4176301, upload-time = "2026-02-27T17:25:46.044Z" }, + { url = "https://files.pythonhosted.org/packages/01/40/6c4c798ffdd83e740dd3925c4e47793b07442a9efa3bc3866ba141a82365/hf_xet-1.3.2-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:cfa760888633b08c01b398d212ce7e8c0d7adac6c86e4b20dfb2397d8acd78ee", size = 3955437, upload-time = "2026-02-27T17:25:44.703Z" }, + { url = "https://files.pythonhosted.org/packages/0c/09/9a3aa7c5f07d3e5cc57bb750d12a124ffa72c273a87164bd848f9ac5cc14/hf_xet-1.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3155a02e083aa21fd733a7485c7c36025e49d5975c8d6bda0453d224dd0b0ac4", size = 4154535, upload-time = "2026-02-27T17:26:05.207Z" }, + { url = "https://files.pythonhosted.org/packages/ae/e0/831f7fa6d90cb47a230bc23284b502c700e1483bbe459437b3844cdc0776/hf_xet-1.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:91b1dc03c31cbf733d35dc03df7c5353686233d86af045e716f1e0ea4a2673cf", size = 4393891, upload-time = "2026-02-27T17:26:06.607Z" }, + { url = "https://files.pythonhosted.org/packages/ab/96/6ed472fdce7f8b70f5da6e3f05be76816a610063003bfd6d9cea0bbb58a3/hf_xet-1.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:211f30098512d95e85ad03ae63bd7dd2c4df476558a5095d09f9e38e78cbf674", size = 3637583, upload-time = "2026-02-27T17:26:17.349Z" }, + { url = 
"https://files.pythonhosted.org/packages/8b/e8/a069edc4570b3f8e123c0b80fadc94530f3d7b01394e1fc1bb223339366c/hf_xet-1.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:4a6817c41de7c48ed9270da0b02849347e089c5ece9a0e72ae4f4b3a57617f82", size = 3497977, upload-time = "2026-02-27T17:26:14.966Z" }, + { url = "https://files.pythonhosted.org/packages/d8/28/dbb024e2e3907f6f3052847ca7d1a2f7a3972fafcd53ff79018977fcb3e4/hf_xet-1.3.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f93b7595f1d8fefddfede775c18b5c9256757824f7f6832930b49858483cd56f", size = 3763961, upload-time = "2026-02-27T17:25:52.537Z" }, + { url = "https://files.pythonhosted.org/packages/e4/71/b99aed3823c9d1795e4865cf437d651097356a3f38c7d5877e4ac544b8e4/hf_xet-1.3.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:a85d3d43743174393afe27835bde0cd146e652b5fcfdbcd624602daef2ef3259", size = 3526171, upload-time = "2026-02-27T17:25:50.968Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/907890ce6ef5598b5920514f255ed0a65f558f820515b18db75a51b2f878/hf_xet-1.3.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7c2a054a97c44e136b1f7f5a78f12b3efffdf2eed3abc6746fc5ea4b39511633", size = 4180750, upload-time = "2026-02-27T17:25:43.125Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ad/bc7f41f87173d51d0bce497b171c4ee0cbde1eed2d7b4216db5d0ada9f50/hf_xet-1.3.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:06b724a361f670ae557836e57801b82c75b534812e351a87a2c739f77d1e0635", size = 3961035, upload-time = "2026-02-27T17:25:41.837Z" }, + { url = "https://files.pythonhosted.org/packages/73/38/600f4dda40c4a33133404d9fe644f1d35ff2d9babb4d0435c646c63dd107/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:305f5489d7241a47e0458ef49334be02411d1d0f480846363c1c8084ed9916f7", size = 4161378, upload-time = "2026-02-27T17:26:00.365Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/b3/7bc1ff91d1ac18420b7ad1e169b618b27c00001b96310a89f8a9294fe509/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:06cdbde243c85f39a63b28e9034321399c507bcd5e7befdd17ed2ccc06dfe14e", size = 4398020, upload-time = "2026-02-27T17:26:03.977Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0b/99bfd948a3ed3620ab709276df3ad3710dcea61976918cce8706502927af/hf_xet-1.3.2-cp37-abi3-win_amd64.whl", hash = "sha256:9298b47cce6037b7045ae41482e703c471ce36b52e73e49f71226d2e8e5685a1", size = 3641624, upload-time = "2026-02-27T17:26:13.542Z" }, + { url = "https://files.pythonhosted.org/packages/cc/02/9a6e4ca1f3f73a164c0cd48e41b3cc56585dcc37e809250de443d673266f/hf_xet-1.3.2-cp37-abi3-win_arm64.whl", hash = "sha256:83d8ec273136171431833a6957e8f3af496bee227a0fe47c7b8b39c106d1749a", size = 3503976, upload-time = "2026-02-27T17:26:12.123Z" }, +] + [[package]] name = "hpack" version = "4.1.0" @@ -995,6 +1040,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "huggingface-hub" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "httpx" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "tqdm" }, + { name = "typer" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/7a/304cec37112382c4fe29a43bcb0d5891f922785d18745883d2aa4eb74e4b/huggingface_hub-1.6.0.tar.gz", hash = 
"sha256:d931ddad8ba8dfc1e816bf254810eb6f38e5c32f60d4184b5885662a3b167325", size = 717071, upload-time = "2026-03-06T14:19:18.524Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/e3/e3a44f54c8e2f28983fcf07f13d4260b37bd6a0d3a081041bc60b91d230e/huggingface_hub-1.6.0-py3-none-any.whl", hash = "sha256:ef40e2d5cb85e48b2c067020fa5142168342d5108a1b267478ed384ecbf18961", size = 612874, upload-time = "2026-03-06T14:19:16.844Z" }, +] + [[package]] name = "hyperframe" version = "6.1.0" @@ -1561,6 +1626,63 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/08/50/d13ea0a054189ae1bc21af1d85b6f8bb9bbc5572991055d70ad9006fe2d6/psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142", size = 2569224, upload-time = "2025-01-04T20:09:19.234Z" }, ] +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/a8/24e5dc6855f50a62936ceb004e6e9645e4219a8065f304145d7fb8a79d5d/pyarrow-23.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:3fab8f82571844eb3c460f90a75583801d14ca0cc32b1acc8c361650e006fd56", size = 34307390, upload-time = "2026-02-16T10:08:08.654Z" }, + { url = "https://files.pythonhosted.org/packages/bc/8e/4be5617b4aaae0287f621ad31c6036e5f63118cfca0dc57d42121ff49b51/pyarrow-23.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3f91c038b95f71ddfc865f11d5876c42f343b4495535bd262c7b321b0b94507c", size = 35853761, upload-time = "2026-02-16T10:08:17.811Z" }, + { url = 
"https://files.pythonhosted.org/packages/2e/08/3e56a18819462210432ae37d10f5c8eed3828be1d6c751b6e6a2e93c286a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d0744403adabef53c985a7f8a082b502a368510c40d184df349a0a8754533258", size = 44493116, upload-time = "2026-02-16T10:08:25.792Z" }, + { url = "https://files.pythonhosted.org/packages/f8/82/c40b68001dbec8a3faa4c08cd8c200798ac732d2854537c5449dc859f55a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c33b5bf406284fd0bba436ed6f6c3ebe8e311722b441d89397c54f871c6863a2", size = 47564532, upload-time = "2026-02-16T10:08:34.27Z" }, + { url = "https://files.pythonhosted.org/packages/20/bc/73f611989116b6f53347581b02177f9f620efdf3cd3f405d0e83cdf53a83/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ddf743e82f69dcd6dbbcb63628895d7161e04e56794ef80550ac6f3315eeb1d5", size = 48183685, upload-time = "2026-02-16T10:08:42.889Z" }, + { url = "https://files.pythonhosted.org/packages/b0/cc/6c6b3ecdae2a8c3aced99956187e8302fc954cc2cca2a37cf2111dad16ce/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e052a211c5ac9848ae15d5ec875ed0943c0221e2fcfe69eee80b604b4e703222", size = 50605582, upload-time = "2026-02-16T10:08:51.641Z" }, + { url = "https://files.pythonhosted.org/packages/8d/94/d359e708672878d7638a04a0448edf7c707f9e5606cee11e15aaa5c7535a/pyarrow-23.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:5abde149bb3ce524782d838eb67ac095cd3fd6090eba051130589793f1a7f76d", size = 27521148, upload-time = "2026-02-16T10:08:58.077Z" }, + { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, + { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, + { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, + { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, + { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, + { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = 
"https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, + { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" }, + { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" }, + { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" }, + { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" }, + { url = "https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" }, + { url = 
"https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" }, + { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" }, + { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" }, + { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" }, + { url = "https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" }, + { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" }, + { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" }, + { url = "https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" }, + { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" }, + { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" }, + { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" }, + { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" }, + { url = "https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" }, + { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" }, + { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" }, + { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" }, + { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" }, +] + [[package]] name = "pycparser" version = "2.22" @@ -2191,6 +2313,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] +[[package]] +name = "tqdm" +version = "4.67.3" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + [[package]] name = "typer" version = "0.16.0" From f024428b59e7fae69879e0281208345306679518 Mon Sep 17 00:00:00 2001 From: Sinatras Date: Mon, 9 Mar 2026 00:34:50 +0300 Subject: [PATCH 3/6] Add tests for HF export and admin API endpoint --- tests/test_admin_api.py | 31 +++++ tests/test_hf_export.py | 248 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 279 insertions(+) create mode 100644 tests/test_hf_export.py diff --git a/tests/test_admin_api.py b/tests/test_admin_api.py index fa4b6751..40fc201a 100644 --- a/tests/test_admin_api.py +++ b/tests/test_admin_api.py @@ -425,3 +425,34 @@ def test_update_problems_with_errors(self, test_client, mock_backend): assert data["status"] == "ok" assert len(data["errors"]) == 1 assert data["errors"][0]["name"] == "bad-problem" + + +class TestAdminExportHF: + """Test admin HF export endpoint.""" + + def test_export_hf_rejects_active_public_export(self, test_client, mock_backend): + """POST /admin/export-hf returns 400 for active public exports.""" + from kernelbot.api import main as api_main + + mock_backend.db.__enter__ = MagicMock(return_value=mock_backend.db) + mock_backend.db.__exit__ = MagicMock(return_value=None) + + with patch.object(api_main.env, "HF_TOKEN", "hf-token"): + with patch( + 
"libkernelbot.hf_export.export_to_hf", + side_effect=ValueError( + "Cannot export active leaderboards to the public dataset: active-comp" + ), + ): + response = test_client.post( + "/admin/export-hf", + headers={"Authorization": "Bearer test_token"}, + json={ + "leaderboard_ids": [1], + "filename": "active-comp.parquet", + "private": False, + }, + ) + + assert response.status_code == 400 + assert "Cannot export active leaderboards" in response.json()["detail"] diff --git a/tests/test_hf_export.py b/tests/test_hf_export.py new file mode 100644 index 00000000..695f7760 --- /dev/null +++ b/tests/test_hf_export.py @@ -0,0 +1,248 @@ +"""Tests for HF export module.""" + +import io +from datetime import datetime, timedelta, timezone +from decimal import Decimal +from unittest.mock import MagicMock + +import pyarrow.parquet as pq +import pytest + +from libkernelbot.hf_export import ( + SUBMISSIONS_SCHEMA, + ensure_public_export_allowed, + get_active_competition_leaderboards, + get_hf_export_rows, + rows_to_parquet_bytes, +) + +NOW = datetime(2026, 3, 8, 12, 0, 0, tzinfo=timezone.utc) + + +def _lb(id, name, deadline): + return {"id": id, "name": name, "deadline": deadline} + + +def _row(submission_id=1, score=0.001, user_id="123", user_name="alice", **overrides): + base = { + "submission_id": submission_id, + "leaderboard_id": 763, + "problem_name": "amd-mxfp4-mm", + "user_id": user_id, + "user_name": user_name, + "code_id": 100, + "file_name": "submission.py", + "submission_time": NOW, + "status": "active", + "score": score, + "passed": True, + "mode": "leaderboard", + "runner": "MI355X", + "code": "import torch", + } + base.update(overrides) + return base + + +# --------------------------------------------------------------------------- +# get_active_competition_leaderboards +# --------------------------------------------------------------------------- + +class TestGetActiveCompetitionLeaderboards: + def test_active_competition_included(self): + lbs = [_lb(763, 
"amd-mxfp4-mm", NOW + timedelta(days=22))] + result = get_active_competition_leaderboards(lbs, now=NOW) + assert len(result) == 1 + assert result[0]["id"] == 763 + + def test_expired_excluded(self): + lbs = [_lb(595, "nvfp4_gemv", NOW - timedelta(days=1))] + assert get_active_competition_leaderboards(lbs, now=NOW) == [] + + def test_dev_excluded(self): + lbs = [_lb(365, "grayscale_py_b200-dev", NOW + timedelta(days=8))] + assert get_active_competition_leaderboards(lbs, now=NOW) == [] + + def test_permanent_year_2100_excluded(self): + lbs = [_lb(537, "conv2d_v2", datetime(2100, 12, 31, tzinfo=timezone.utc))] + assert get_active_competition_leaderboards(lbs, now=NOW) == [] + + def test_mixed_real_data(self): + lbs = [ + _lb(763, "amd-mxfp4-mm", NOW + timedelta(days=22)), + _lb(764, "amd-moe-mxfp4", NOW + timedelta(days=22)), + _lb(537, "conv2d_v2", datetime(2100, 12, 31, tzinfo=timezone.utc)), + _lb(365, "grayscale_py_b200-dev", NOW + timedelta(days=8)), + _lb(595, "nvfp4_gemv", NOW - timedelta(days=100)), + ] + result = get_active_competition_leaderboards(lbs, now=NOW) + assert [lb["id"] for lb in result] == [763, 764] + + def test_naive_deadline_treated_as_utc(self): + naive = datetime(2026, 3, 30, 6, 0, 0) + lbs = [_lb(763, "amd-mxfp4-mm", naive)] + assert len(get_active_competition_leaderboards(lbs, now=NOW)) == 1 + + def test_empty_input(self): + assert get_active_competition_leaderboards([], now=NOW) == [] + + def test_deadline_exactly_at_horizon_excluded(self): + lbs = [_lb(1, "edge", NOW + timedelta(days=365))] + assert get_active_competition_leaderboards(lbs, now=NOW) == [] + + def test_deadline_just_inside_horizon(self): + lbs = [_lb(1, "edge", NOW + timedelta(days=364))] + assert len(get_active_competition_leaderboards(lbs, now=NOW)) == 1 + + +# --------------------------------------------------------------------------- +# rows_to_parquet_bytes +# --------------------------------------------------------------------------- + +class TestRowsToParquetBytes: + 
def test_schema_matches_target(self): + data = rows_to_parquet_bytes([_row()]) + table = pq.read_table(io.BytesIO(data)) + assert table.schema.equals(SUBMISSIONS_SCHEMA) + + def test_roundtrip_values(self): + data = rows_to_parquet_bytes([_row(submission_id=42, score=0.005, user_name="bob")]) + table = pq.read_table(io.BytesIO(data)) + assert table.num_rows == 1 + assert table.column("submission_id")[0].as_py() == 42 + assert table.column("user_name")[0].as_py() == "bob" + assert abs(table.column("score")[0].as_py() - 0.005) < 1e-10 + + def test_empty_rows_valid_parquet(self): + data = rows_to_parquet_bytes([]) + table = pq.read_table(io.BytesIO(data)) + assert table.num_rows == 0 + assert table.schema.equals(SUBMISSIONS_SCHEMA) + + def test_decimal_score_converted(self): + data = rows_to_parquet_bytes([_row(score=Decimal("1.7582115811857097E-7"))]) + table = pq.read_table(io.BytesIO(data)) + assert abs(table.column("score")[0].as_py() - 1.758e-7) < 1e-10 + + def test_none_score(self): + data = rows_to_parquet_bytes([_row(score=None)]) + table = pq.read_table(io.BytesIO(data)) + assert table.column("score")[0].as_py() is None + + def test_none_user_name_becomes_empty_string(self): + data = rows_to_parquet_bytes([_row(user_name=None)]) + table = pq.read_table(io.BytesIO(data)) + assert table.column("user_name")[0].as_py() == "" + + def test_integer_user_id_cast_to_string(self): + data = rows_to_parquet_bytes([_row(user_id=12345)]) + table = pq.read_table(io.BytesIO(data)) + assert table.column("user_id")[0].as_py() == "12345" + + def test_multiple_rows(self): + rows = [_row(submission_id=i, score=float(i)) for i in range(100)] + data = rows_to_parquet_bytes(rows) + table = pq.read_table(io.BytesIO(data)) + assert table.num_rows == 100 + + def test_none_passed(self): + data = rows_to_parquet_bytes([_row(passed=None)]) + table = pq.read_table(io.BytesIO(data)) + assert table.column("passed")[0].as_py() is None + + +# 
--------------------------------------------------------------------------- +# get_hf_export_rows +# --------------------------------------------------------------------------- + +class TestGetHfExportRows: + def test_empty_leaderboard_ids_skips_query(self): + db = MagicMock() + assert get_hf_export_rows(db, []) == [] + db.cursor.execute.assert_not_called() + + def test_maps_columns_correctly(self): + db = MagicMock() + db.cursor.fetchall.return_value = [ + (1, 763, "amd-mxfp4-mm", "123", "alice", 100, "sub.py", + NOW, "active", 0.001, True, "leaderboard", "MI355X", "code here"), + ] + rows = get_hf_export_rows(db, [763]) + assert len(rows) == 1 + assert rows[0]["submission_id"] == 1 + assert rows[0]["problem_name"] == "amd-mxfp4-mm" + assert rows[0]["code"] == "code here" + assert rows[0]["runner"] == "MI355X" + + def test_query_filters_secret_runs(self): + db = MagicMock() + db.cursor.fetchall.return_value = [] + get_hf_export_rows(db, [763]) + sql = db.cursor.execute.call_args[0][0] + assert "NOT r.secret" in sql + + def test_query_partitions_by_runner(self): + db = MagicMock() + db.cursor.fetchall.return_value = [] + get_hf_export_rows(db, [763]) + sql = db.cursor.execute.call_args[0][0] + assert "PARTITION BY" in sql + assert "r.runner" in sql + + def test_query_prefers_submission_job_status(self): + db = MagicMock() + db.cursor.fetchall.return_value = [] + get_hf_export_rows(db, [763]) + sql = db.cursor.execute.call_args[0][0] + assert "submission_job_status" in sql + assert "COALESCE(" in sql + assert "sjs.status" in sql + + def test_query_falls_back_to_derived_status_for_legacy_rows(self): + db = MagicMock() + db.cursor.fetchall.return_value = [] + get_hf_export_rows(db, [763]) + sql = db.cursor.execute.call_args[0][0] + assert "WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'" in sql + assert "WHEN s.done THEN 'failed'" in sql + + def test_passes_leaderboard_ids_as_param(self): + db = MagicMock() + db.cursor.fetchall.return_value = [] + 
get_hf_export_rows(db, [763, 764, 765]) + args = db.cursor.execute.call_args[0][1] + assert args == ([763, 764, 765],) + + +# --------------------------------------------------------------------------- +# ensure_public_export_allowed +# --------------------------------------------------------------------------- + +class TestEnsurePublicExportAllowed: + def test_expired_allowed(self): + db = MagicMock() + db.get_leaderboards.return_value = [_lb(595, "nvfp4_gemv", NOW - timedelta(days=1))] + ensure_public_export_allowed(db, [595], now=NOW) + + def test_active_blocked(self): + db = MagicMock() + db.get_leaderboards.return_value = [_lb(763, "amd-mxfp4-mm", NOW + timedelta(days=22))] + with pytest.raises(ValueError, match="Cannot export active leaderboards"): + ensure_public_export_allowed(db, [763], now=NOW) + + def test_only_checks_selected_ids(self): + db = MagicMock() + db.get_leaderboards.return_value = [ + _lb(763, "amd-mxfp4-mm", NOW + timedelta(days=22)), + _lb(595, "nvfp4_gemv", NOW - timedelta(days=1)), + ] + ensure_public_export_allowed(db, [595], now=NOW) + + def test_error_lists_active_names_sorted(self): + db = MagicMock() + db.get_leaderboards.return_value = [ + _lb(764, "amd-moe-mxfp4", NOW + timedelta(days=22)), + _lb(763, "amd-mxfp4-mm", NOW + timedelta(days=22)), + ] + with pytest.raises(ValueError, match="amd-moe-mxfp4.*amd-mxfp4-mm"): + ensure_public_export_allowed(db, [763, 764], now=NOW) From b07cc52cb9db5bb1a223bfa0164613baf5463a2e Mon Sep 17 00:00:00 2001 From: Sinatras Date: Mon, 9 Mar 2026 00:55:32 +0300 Subject: [PATCH 4/6] apply review fixes, req hardening and in memory buffer addition --- src/kernelbot/api/main.py | 6 ++++-- src/libkernelbot/hf_export.py | 19 +++++++------------ tests/test_admin_api.py | 33 +++++++++++++++++++++++++++++++++ tests/test_hf_export.py | 29 ++++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 15 deletions(-) diff --git a/src/kernelbot/api/main.py b/src/kernelbot/api/main.py index 92f45450..9e53ec86 
100644 --- a/src/kernelbot/api/main.py +++ b/src/kernelbot/api/main.py @@ -694,9 +694,11 @@ async def admin_export_hf( filename = payload.get("filename") private = payload.get("private", True) - if not leaderboard_ids or not isinstance(leaderboard_ids, list): + if not isinstance(leaderboard_ids, list) or not leaderboard_ids: raise HTTPException(status_code=400, detail="leaderboard_ids must be a non-empty list of integers") - if not filename or not filename.endswith(".parquet"): + if not all(isinstance(leaderboard_id, int) for leaderboard_id in leaderboard_ids): + raise HTTPException(status_code=400, detail="leaderboard_ids must be a non-empty list of integers") + if not isinstance(filename, str) or not filename.endswith(".parquet"): raise HTTPException(status_code=400, detail="filename must end with .parquet") if not env.HF_TOKEN: raise HTTPException(status_code=500, detail="HF_TOKEN not configured") diff --git a/src/libkernelbot/hf_export.py b/src/libkernelbot/hf_export.py index e68d75b9..b5b8235a 100644 --- a/src/libkernelbot/hf_export.py +++ b/src/libkernelbot/hf_export.py @@ -1,7 +1,6 @@ """Export competition submissions to Hugging Face datasets as parquet files.""" import io -import tempfile from datetime import datetime, timezone import pyarrow as pa @@ -201,16 +200,13 @@ def export_to_hf( rows = get_hf_export_rows(db, leaderboard_ids) parquet_bytes = rows_to_parquet_bytes(rows) - - with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp: - tmp.write(parquet_bytes) - tmp.flush() - api.upload_file( - path_or_fileobj=tmp.name, - path_in_repo=filename, - repo_id=repo_id, - repo_type="dataset", - ) + parquet_buffer = io.BytesIO(parquet_bytes) + api.upload_file( + path_or_fileobj=parquet_buffer, + path_in_repo=filename, + repo_id=repo_id, + repo_type="dataset", + ) logger.info("Exported %d rows to %s/%s", len(rows), repo_id, filename) return {"rows": len(rows), "repo_id": repo_id, "filename": filename} @@ -219,7 +215,6 @@ def export_to_hf( def 
publish_to_public_repo( db: LeaderboardDB, leaderboard_ids: list[int], - private_repo_id: str, public_repo_id: str, filename: str, token: str, diff --git a/tests/test_admin_api.py b/tests/test_admin_api.py index 40fc201a..35f0e472 100644 --- a/tests/test_admin_api.py +++ b/tests/test_admin_api.py @@ -430,6 +430,39 @@ def test_update_problems_with_errors(self, test_client, mock_backend): class TestAdminExportHF: """Test admin HF export endpoint.""" + def test_export_hf_rejects_non_int_leaderboard_ids(self, test_client): + """POST /admin/export-hf returns 400 for non-integer leaderboard IDs.""" + from kernelbot.api import main as api_main + + with patch.object(api_main.env, "HF_TOKEN", "hf-token"): + response = test_client.post( + "/admin/export-hf", + headers={"Authorization": "Bearer test_token"}, + json={ + "leaderboard_ids": ["1"], + "filename": "active-comp.parquet", + "private": True, + }, + ) + + assert response.status_code == 400 + assert response.json()["detail"] == "leaderboard_ids must be a non-empty list of integers" + + def test_export_hf_rejects_non_string_filename(self, test_client): + """POST /admin/export-hf returns 400 for non-string filenames.""" + response = test_client.post( + "/admin/export-hf", + headers={"Authorization": "Bearer test_token"}, + json={ + "leaderboard_ids": [1], + "filename": 123, + "private": True, + }, + ) + + assert response.status_code == 400 + assert response.json()["detail"] == "filename must end with .parquet" + def test_export_hf_rejects_active_public_export(self, test_client, mock_backend): """POST /admin/export-hf returns 400 for active public exports.""" from kernelbot.api import main as api_main diff --git a/tests/test_hf_export.py b/tests/test_hf_export.py index 695f7760..4b7e2fad 100644 --- a/tests/test_hf_export.py +++ b/tests/test_hf_export.py @@ -3,7 +3,7 @@ import io from datetime import datetime, timedelta, timezone from decimal import Decimal -from unittest.mock import MagicMock +from unittest.mock import 
MagicMock, patch import pyarrow.parquet as pq import pytest @@ -11,6 +11,7 @@ from libkernelbot.hf_export import ( SUBMISSIONS_SCHEMA, ensure_public_export_allowed, + export_to_hf, get_active_competition_leaderboards, get_hf_export_rows, rows_to_parquet_bytes, @@ -246,3 +247,29 @@ def test_error_lists_active_names_sorted(self): ] with pytest.raises(ValueError, match="amd-moe-mxfp4.*amd-mxfp4-mm"): ensure_public_export_allowed(db, [763, 764], now=NOW) + + +# --------------------------------------------------------------------------- +# export_to_hf +# --------------------------------------------------------------------------- + +class TestExportToHF: + def test_uploads_from_in_memory_buffer(self): + db = MagicMock() + + with patch("libkernelbot.hf_export.HfApi") as mock_api_cls: + mock_api = mock_api_cls.return_value + with patch("libkernelbot.hf_export.get_hf_export_rows", return_value=[_row()]): + result = export_to_hf( + db=db, + leaderboard_ids=[763], + repo_id="GPUMODE/kernelbot-data-live", + filename="active_submissions.parquet", + token="hf-token", + private=True, + ) + + upload_arg = mock_api.upload_file.call_args.kwargs["path_or_fileobj"] + assert isinstance(upload_arg, io.BytesIO) + assert upload_arg.getbuffer().nbytes > 0 + assert result["rows"] == 1 From 17328f8b03175217d2bfaaf9d2c952fb5daaba3e Mon Sep 17 00:00:00 2001 From: Sinatras Date: Mon, 9 Mar 2026 02:58:40 +0300 Subject: [PATCH 5/6] switch to standalone sql file --- pyproject.toml | 3 ++ src/kernelbot/cogs/admin_cog.py | 8 ++- src/libkernelbot/hf_export.py | 59 +++------------------ src/libkernelbot/sql/get_hf_export_rows.sql | 43 +++++++++++++++ 4 files changed, 59 insertions(+), 54 deletions(-) create mode 100644 src/libkernelbot/sql/get_hf_export_rows.sql diff --git a/pyproject.toml b/pyproject.toml index 7bc8ee3e..2b2da6cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,9 @@ dev = [ [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +libkernelbot = 
["sql/*.sql"] + [tool.coverage.run] omit = ["src/libkernelbot/run_eval.py", "src/libkernelbot/launchers/*.py"] relative_files = true diff --git a/src/kernelbot/cogs/admin_cog.py b/src/kernelbot/cogs/admin_cog.py index 81da4cf5..739e309e 100644 --- a/src/kernelbot/cogs/admin_cog.py +++ b/src/kernelbot/cogs/admin_cog.py @@ -881,7 +881,13 @@ async def _scheduled_cleanup_temp_users(self): @tasks.loop(hours=24) async def _scheduled_hf_export(self): - """Daily export of active competition submissions to private HF dataset.""" + """Daily export of active competition submissions to private HF dataset. + + Once a competition expires, it drops out of the scheduled export set. If + there are still results settling after the deadline, a manual export is + needed once the queue drains. Currently public HF dataset releases are + handled manually. + """ from libkernelbot.hf_export import export_to_hf, get_active_competition_leaderboards try: diff --git a/src/libkernelbot/hf_export.py b/src/libkernelbot/hf_export.py index b5b8235a..1c00f063 100644 --- a/src/libkernelbot/hf_export.py +++ b/src/libkernelbot/hf_export.py @@ -2,6 +2,7 @@ import io from datetime import datetime, timezone +from importlib.resources import files import pyarrow as pa import pyarrow.parquet as pq @@ -11,6 +12,9 @@ from libkernelbot.utils import setup_logging logger = setup_logging(__name__) +HF_EXPORT_ROWS_SQL = files("libkernelbot").joinpath("sql/get_hf_export_rows.sql").read_text( + encoding="utf-8" +) # Explicit schema matching GPUMODE/kernelbot-data nvidia_nvfp4_submissions.parquet SUBMISSIONS_SCHEMA = pa.schema([ @@ -96,62 +100,11 @@ def ensure_public_export_allowed( def get_hf_export_rows(db: LeaderboardDB, leaderboard_ids: list[int]) -> list[dict]: - """Fetch deduplicated submissions for export. - - Deduplicates by (leaderboard_id, user_id, code_id, runner), keeping the - fastest score. Excludes secret runs. 
- """ + """Fetch deduplicated submissions for export.""" if not leaderboard_ids: return [] - db.cursor.execute( - """ - WITH ranked AS ( - SELECT - s.id as submission_id, - s.leaderboard_id, - l.name as problem_name, - s.user_id, - u.user_name, - s.code_id, - s.file_name, - s.submission_time, - COALESCE( - sjs.status, - CASE - WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded' - WHEN s.done THEN 'failed' - ELSE s.status - END - ) as status, - r.score, - r.passed, - r.mode, - r.runner, - COALESCE(c.old_code, convert_from(c.code, 'UTF8')) as code, - ROW_NUMBER() OVER ( - PARTITION BY s.leaderboard_id, s.user_id, s.code_id, r.runner - ORDER BY r.score ASC NULLS LAST, s.submission_time ASC - ) as rn - FROM leaderboard.submission s - JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id - LEFT JOIN leaderboard.user_info u ON s.user_id = u.id - LEFT JOIN leaderboard.submission_job_status sjs ON s.id = sjs.submission_id - LEFT JOIN leaderboard.runs r - ON s.id = r.submission_id AND r.mode = 'leaderboard' AND NOT r.secret - LEFT JOIN leaderboard.code_files c ON s.code_id = c.id - WHERE s.leaderboard_id = ANY(%s) - ) - SELECT - submission_id, leaderboard_id, problem_name, user_id, user_name, - code_id, file_name, submission_time, status, score, passed, mode, - runner, code - FROM ranked - WHERE rn = 1 - ORDER BY problem_name, score ASC NULLS LAST - """, - (leaderboard_ids,), - ) + db.cursor.execute(HF_EXPORT_ROWS_SQL, (leaderboard_ids,)) columns = [ "submission_id", "leaderboard_id", "problem_name", "user_id", "user_name", diff --git a/src/libkernelbot/sql/get_hf_export_rows.sql b/src/libkernelbot/sql/get_hf_export_rows.sql new file mode 100644 index 00000000..8f4d3537 --- /dev/null +++ b/src/libkernelbot/sql/get_hf_export_rows.sql @@ -0,0 +1,43 @@ +WITH ranked AS ( + SELECT + s.id as submission_id, + s.leaderboard_id, + l.name as problem_name, + s.user_id, + u.user_name, + s.code_id, + s.file_name, + s.submission_time, + COALESCE( + sjs.status, + CASE + 
WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded' + WHEN s.done THEN 'failed' + ELSE s.status + END + ) as status, + r.score, + r.passed, + r.mode, + r.runner, + COALESCE(c.old_code, convert_from(c.code, 'UTF8')) as code, + ROW_NUMBER() OVER ( + PARTITION BY s.leaderboard_id, s.user_id, s.code_id, r.runner + ORDER BY r.score ASC NULLS LAST, s.submission_time ASC + ) as rn + FROM leaderboard.submission s + JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id + LEFT JOIN leaderboard.user_info u ON s.user_id = u.id + LEFT JOIN leaderboard.submission_job_status sjs ON s.id = sjs.submission_id + LEFT JOIN leaderboard.runs r + ON s.id = r.submission_id AND r.mode = 'leaderboard' AND NOT r.secret + LEFT JOIN leaderboard.code_files c ON s.code_id = c.id + WHERE s.leaderboard_id = ANY(%s) +) +SELECT + submission_id, leaderboard_id, problem_name, user_id, user_name, + code_id, file_name, submission_time, status, score, passed, mode, + runner, code +FROM ranked +WHERE rn = 1 +ORDER BY problem_name, score ASC NULLS LAST From beae0d5dba231f3e9fe5becbb14210f284127f2c Mon Sep 17 00:00:00 2001 From: Sinatras Date: Mon, 9 Mar 2026 02:59:46 +0300 Subject: [PATCH 6/6] switch to tempfile, hf xet support is not available over inmemorybuffer --- src/libkernelbot/hf_export.py | 17 ++++++++++------- tests/test_hf_export.py | 14 +++++++++++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/libkernelbot/hf_export.py b/src/libkernelbot/hf_export.py index 1c00f063..4fd12ad0 100644 --- a/src/libkernelbot/hf_export.py +++ b/src/libkernelbot/hf_export.py @@ -1,6 +1,7 @@ """Export competition submissions to Hugging Face datasets as parquet files.""" import io +import tempfile from datetime import datetime, timezone from importlib.resources import files @@ -153,13 +154,15 @@ def export_to_hf( rows = get_hf_export_rows(db, leaderboard_ids) parquet_bytes = rows_to_parquet_bytes(rows) - parquet_buffer = io.BytesIO(parquet_bytes) - api.upload_file( - 
path_or_fileobj=parquet_buffer, - path_in_repo=filename, - repo_id=repo_id, - repo_type="dataset", - ) + with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp: + tmp.write(parquet_bytes) + tmp.flush() + api.upload_file( + path_or_fileobj=tmp.name, + path_in_repo=filename, + repo_id=repo_id, + repo_type="dataset", + ) logger.info("Exported %d rows to %s/%s", len(rows), repo_id, filename) return {"rows": len(rows), "repo_id": repo_id, "filename": filename} diff --git a/tests/test_hf_export.py b/tests/test_hf_export.py index 4b7e2fad..83708e1e 100644 --- a/tests/test_hf_export.py +++ b/tests/test_hf_export.py @@ -254,11 +254,18 @@ def test_error_lists_active_names_sorted(self): # --------------------------------------------------------------------------- class TestExportToHF: - def test_uploads_from_in_memory_buffer(self): + def test_uploads_from_temp_parquet_file(self): db = MagicMock() + observed = {} with patch("libkernelbot.hf_export.HfApi") as mock_api_cls: mock_api = mock_api_cls.return_value + def _capture_upload(**kwargs): + path = kwargs["path_or_fileobj"] + observed["path"] = path + observed["size"] = __import__("os").path.getsize(path) + + mock_api.upload_file.side_effect = _capture_upload with patch("libkernelbot.hf_export.get_hf_export_rows", return_value=[_row()]): result = export_to_hf( db=db, @@ -270,6 +277,7 @@ def test_uploads_from_in_memory_buffer(self): ) upload_arg = mock_api.upload_file.call_args.kwargs["path_or_fileobj"] - assert isinstance(upload_arg, io.BytesIO) - assert upload_arg.getbuffer().nbytes > 0 + assert isinstance(upload_arg, str) + assert upload_arg.endswith(".parquet") + assert observed["size"] > 0 assert result["rows"] == 1