一个用于从 Hugging Face 下载文件的简易工具
本文介绍脚本 hf_get_from_url.py 的功能、主要特性、使用示例以及完整源代码。
hf_get_from_url.py – 从 Hugging Face 仓库下载文件
机器学习开发者经常需要从 Hugging Face 仓库下载文件。虽然 Hugging Face 网站提供了下载链接,但手动处理 URL 和路径比较麻烦。脚本 hf_get_from_url.py 会解析多种输入格式,并调用 huggingface_hub 库的 API 来获取所需文件,从而简化这一过程。
脚本能够处理的情况
| 输入形式 | 示例 |
|---|---|
| 带有 blob 或 resolve 的完整 Hugging Face URL | https://huggingface.co/owner/repo/blob/main/file.gguf |
| 缩写形式(无协议) | huggingface.co/owner/repo/blob/main/file.gguf |
| 仓库式路径(owner/repo/file) | owner/repo/file.gguf |
| 直接的仓库引用(可带或不带文件路径) | owner/repo 或 owner/repo/path/to/dir |
解析输入后,脚本通过 huggingface_hub 库将指定的文件(或目录)下载到本地目录。
主要特性
- 灵活的输入解析 – 支持完整 URL、缩写 URL 以及普通的 `owner/repo/path` 字符串。
- 试运行模式 – `--dry-run` 会打印将要执行的下载操作,但不实际下载。
- 自定义本地目录 – `--flatten-localdir` 会将文件存入一个使用连字符 (owner-repo) 而非斜杠的文件夹,使用更方便。
- 错误处理 – 检查是否已安装 `huggingface_hub`,并在出现问题时提供明确的错误信息。
使用示例
# 使用完整 URL 下载
python hf_get_from_url.py "https://huggingface.co/owner/repo/blob/main/file.gguf"
# 使用仓库式路径下载
python hf_get_from_url.py owner/repo/file.gguf
# 预览命令而不实际下载
python hf_get_from_url.py --dry-run "huggingface.co/owner/repo/blob/main/file.gguf"
这些示例展示了该工具如何为经常从 Hugging Face 下载模型、数据集或配置文件的用户简化工作流程。
源代码
#!/usr/bin/env python3
"""
hf_get_from_url.py

Download files or directories from Hugging Face Hub using huggingface_hub API.

Usage:
    python hf_get_from_url.py [--dry-run] [--flatten-localdir] <input> [<input> ...]

Examples:
    python hf_get_from_url.py "https://huggingface.co/owner/repo/blob/main/path/to/file.gguf"
    python hf_get_from_url.py owner/repo/path/to/file.gguf
    python hf_get_from_url.py --dry-run "huggingface.co/owner/repo/blob/main/models"

Notes:
    - Requires: pip install huggingface_hub
    - Authentication (if needed) is taken from env HF_TOKEN or huggingface_hub.login()
"""
from __future__ import annotations
import argparse
import sys
import re
from urllib.parse import urlparse, unquote
from typing import Optional, Tuple, List
# huggingface_hub API
# The import is optional at load time: when the package is missing, both names
# are bound to None so that the download routine can report a readable error
# instead of the module failing with a traceback on import.
try:
    from huggingface_hub import hf_hub_download, snapshot_download
except ImportError:
    # Only a missing/unimportable package is tolerated here; any other failure
    # is a real error and should surface rather than be reported as
    # "not installed" later on.
    hf_hub_download = None
    snapshot_download = None
# ----------------------------------------------------------------------
# Regex patterns
# ----------------------------------------------------------------------
# Full Hugging Face URL (scheme and "www." both optional):
#   huggingface.co/<owner>/<repo>/(blob|resolve)/<rev>/<path>
RE_BLOB_RESOLVE = re.compile(
    r'^(?:https?://)?(?:www\.)?huggingface\.co/'
    r'(?P<repo>[^/]+/[^/]+)/(?:blob|resolve)/'
    r'(?P<rev>[^/]+)/(?P<path>.+)$'
)

# Same layout with the host already removed:
#   <owner>/<repo>/(blob|resolve)/<rev>/<path>
RE_NO_PREFIX = re.compile(
    r'^(?P<repo>[^/]+/[^/]+)/(?:blob|resolve)/'
    r'(?P<rev>[^/]+)/(?P<path>.+)$'
)

# Plain reference: <owner>/<repo> with an optional /<path>.
# The "path" group is None when only <owner>/<repo> is given.
RE_SIMPLE = re.compile(
    r'^(?P<repo>[^/]+/[^/]+)(?:/(?P<path>.+))?$'
)
# ----------------------------------------------------------------------
# Input parser
# ----------------------------------------------------------------------
def parse_input(s: str) -> Optional[Tuple[str, Optional[str], str]]:
    """
    Parse a user-supplied reference into ``(repo, revision, path)``.

    The input is stripped, its query string and fragment are dropped, the
    remainder is percent-decoded and trailing slashes are removed. The
    following forms are then tried in order:

    1. A Hugging Face ``blob``/``resolve`` URL (scheme and ``www.`` optional).
    2. A scheme-less ``huggingface.co/...`` (or ``www.huggingface.co/...``)
       reference.
    3. Any other URL whose network location contains ``huggingface``.
    4. A bare ``owner/repo/...`` reference.

    Args:
        s: URL or repo-style reference.

    Returns:
        ``(repo, revision, path)`` where ``repo`` is ``owner/name`` and
        ``revision`` is ``None`` when the input does not name one (meaning the
        default branch). ``None`` is returned when no non-empty path can be
        extracted, e.g. for a bare ``owner/repo``.
    """
    s = s.strip()
    s = unquote(s.split('?', 1)[0].split('#', 1)[0]).rstrip('/')

    # 1) Explicit Hugging Face URL (blob/resolve)
    m = RE_BLOB_RESOLVE.match(s)
    if m:
        return m.group('repo'), m.group('rev'), m.group('path')

    # 2) huggingface.co/... without scheme
    host_prefix = next(
        (h for h in ('huggingface.co/', 'www.huggingface.co/') if s.startswith(h)),
        None,
    )
    if host_prefix is not None:
        candidate = s[len(host_prefix):].lstrip('/')
        m2 = RE_NO_PREFIX.match(candidate)
        if m2:
            return m2.group('repo'), m2.group('rev'), m2.group('path')
        m2 = RE_SIMPLE.match(candidate)
        if m2 and m2.group('path'):
            return m2.group('repo'), None, m2.group('path')
        # The host was recognised but no file path follows the repo. Stop
        # here: falling through to step 4 would treat the host name itself
        # as the repo owner (e.g. repo "huggingface.co/owner").
        return None

    # 3) Generic URL parse
    try:
        p = urlparse(s)
    except ValueError:
        # Malformed URL text; fall back to treating the input as a plain path.
        p = None
    if p and p.netloc and 'huggingface' in p.netloc:
        parts = p.path.lstrip('/').split('/')
        if len(parts) >= 5 and parts[2] in ('blob', 'resolve'):
            repo = f"{parts[0]}/{parts[1]}"
            rev = parts[3]
            path = '/'.join(parts[4:])
            return repo, rev, path
        elif len(parts) >= 3:
            repo = f"{parts[0]}/{parts[1]}"
            path = '/'.join(parts[2:])
            return repo, None, path

    # 4) Direct repo/path
    m3 = RE_NO_PREFIX.match(s)
    if m3:
        return m3.group('repo'), m3.group('rev'), m3.group('path')
    m4 = RE_SIMPLE.match(s)
    if m4 and m4.group('path'):
        return m4.group('repo'), None, m4.group('path')

    return None
# ----------------------------------------------------------------------
# Download logic
# ----------------------------------------------------------------------
def run_hf_download_api(
    repo: str,
    path: str,
    rev: Optional[str],
    local_dir: Optional[str],
    dry_run: bool,
) -> int:
    """
    Download ``path`` from ``repo``: single file first, snapshot as fallback.

    The function first calls ``hf_hub_download`` for ``path``. If that raises,
    it falls back to ``snapshot_download`` restricted to entries matching
    ``path`` followed by a wildcard (directory or pattern).

    Args:
        repo: Repository id in ``owner/name`` form.
        path: File path, directory path or pattern inside the repository.
        rev: Revision to download from; ``None`` is passed through unchanged
            and displayed as "default".
        local_dir: Target directory; defaults to ``repo`` when ``None``.
        dry_run: When true, only print the planned action and return.

    Returns:
        ``0`` on success (or dry run), ``1`` when both download attempts
        fail, ``2`` when ``huggingface_hub`` could not be imported.
    """
    if hf_hub_download is None or snapshot_download is None:
        print(
            "Error: huggingface_hub is not installed. "
            "Please run `pip install huggingface_hub`.",
            file=sys.stderr,
        )
        return 2

    if local_dir is None:
        local_dir = repo
    rev_disp = rev if rev else "default"

    print(f"> (api) download {repo}@{rev_disp} {path} -> local_dir={local_dir}")
    if dry_run:
        return 0

    # ---- Try as single file ----
    try:
        local_path = hf_hub_download(
            repo_id=repo,
            filename=path,
            revision=rev,
            local_dir=local_dir,
        )
        print(f"✔ Downloaded file to {local_path}")
        return 0
    except Exception as e_file:
        # Not fatal: the path may be a directory or a pattern, so report the
        # failure and continue with the snapshot fallback below.
        print(f"⚠ Single‑file download failed: {e_file}", file=sys.stderr)

    # ---- Fallback: download whole repo or sub-directory ----
    try:
        snapshot_download(
            repo_id=repo,
            revision=rev,
            local_dir=local_dir,
            # Trailing wildcard after the path; presumably so that a directory
            # path also selects the files beneath it. NOTE(review): this also
            # selects any sibling entry that merely starts with `path` -
            # confirm against huggingface_hub's pattern semantics.
            allow_patterns=[f"{path}*"],
        )
        print(f"✔ Snapshot downloaded to {local_dir}")
        return 0
    except Exception as e_snap:
        print(f"❌ Snapshot download failed: {e_snap}", file=sys.stderr)
        return 1
# ----------------------------------------------------------------------
# CLI entry point
# ----------------------------------------------------------------------
def main(argv: List[str] | None = None) -> int:
    """
    Command-line entry point: parse each input and download it.

    Args:
        argv: Argument list handed to argparse; ``None`` lets argparse read
            the process arguments itself.

    Returns:
        ``0`` when every input succeeded. Otherwise ``1`` for an input that
        could not be parsed, or the non-zero code of a failed download; the
        most recent failure wins.
    """
    cli = argparse.ArgumentParser(
        description="Download files from Hugging Face Hub using flexible input formats."
    )
    cli.add_argument(
        "inputs",
        nargs="+",
        help="URL, repo/path, or shortened Hugging Face reference.",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Show the commands that would be run without downloading.",
    )
    cli.add_argument(
        "--flatten-localdir",
        action="store_true",
        help="Replace '/' with '-' in the local directory name.",
    )
    opts = cli.parse_args(argv)

    status = 0
    for raw in opts.inputs:
        result = parse_input(raw)
        if not result:
            print(f"❌ Could not parse input: {raw}", file=sys.stderr)
            status = 1
            continue

        repo, rev, path = result

        # Without --flatten-localdir the target directory is left unset (None)
        # and the download routine receives it as such.
        target_dir = None
        if opts.flatten_localdir:
            target_dir = repo.replace("/", "-")

        rc = run_hf_download_api(
            repo=repo,
            path=path,
            rev=rev,
            local_dir=target_dir,
            dry_run=opts.dry_run,
        )
        if rc != 0:
            status = rc

    return status
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
此工具简化了经常从 Hugging Face 下载模型、数据集或配置文件的工作流程。通过接受不同的输入形式并提供清晰的反馈,它使过程更快且更不易出错。
filename = path,
revision = rev,
local_dir = local_dir,
)
print(f"Downloaded file: {local_path}")
return 0
except Exception as e:
print(
f"hf_hub_download failed: {e}. "
"Trying snapshot_download for directory/pattern...",
file=sys.stderr,
)
# ---- Fallback: directory or glob ----
allow_pattern = path.rstrip("/") + "/*"
try:
repo_local_dir = snapshot_download(
repo_id=repo,
revision=rev,
local_dir=local_dir,
allow_patterns=[allow_pattern],
)
print(f"Snapshot downloaded into: {repo_local_dir}")
return 0
except Exception as e:
print(f"snapshot_download failed: {e}", file=sys.stderr)
return 3
# ----------------------------------------------------------------------
# Main
# ----------------------------------------------------------------------
def main(argv: List[str]) -> int:
    """
    Command-line entry point: parse each input and download it.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        ``0`` when every input was parsed and downloaded successfully,
        ``1`` when at least one input failed to parse, yielded no path, or
        failed to download.
    """
    parser = argparse.ArgumentParser(
        description="Download files or directories from Hugging Face Hub"
    )
    parser.add_argument(
        "inputs",
        nargs="+",
        help="Hugging Face URL or /path",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print actions without executing",
    )
    # Accepted but never read below: kept so existing command lines that pass
    # --hf-cmd keep working.
    parser.add_argument(
        "--hf-cmd",
        default="hf",
        help="(ignored, kept for compatibility)",
    )
    parser.add_argument(
        "--flatten-localdir",
        action="store_true",
        help="Replace '/' with '-' in local directory name",
    )
    args = parser.parse_args(argv)

    any_failed = False
    for s in args.inputs:
        parsed = parse_input(s)
        if not parsed:
            print(f"Failed to parse input: {s}", file=sys.stderr)
            any_failed = True
            continue

        repo, rev, path = parsed
        if not path:
            print(f"No file path extracted for input: {s}", file=sys.stderr)
            any_failed = True
            continue

        # Default target directory is the repo id itself; --flatten-localdir
        # turns "owner/repo" into "owner-repo".
        local_dir = repo.replace("/", "-") if args.flatten_localdir else repo
        rc = run_hf_download_api(
            repo=repo,
            path=path,
            rev=rev,
            local_dir=local_dir,
            dry_run=args.dry_run,
        )
        if rc != 0:
            any_failed = True

    return 1 if any_failed else 0
# Run the CLI only when executed as a script; the arguments after the program
# name are forwarded and main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))