From 3fe9b00de74b56fdacfa12a739e91be881de5ab2 Mon Sep 17 00:00:00 2001
From: theliu
Date: Sat, 25 Apr 2026 14:10:09 +0800
Subject: [PATCH] Initial commit: V1

---
 .gitignore        |  10 ++-
 README.md         | 182 +++++++++++++++++++++++-----------------------
 config.py         |  93 +++++++++++++----------
 crash.log         |  34 +++++++++
 gui.py            |  24 +++---
 image_gen.py      | 147 ++++++++++++++++++-------------------
 run.bat           |  14 +---
 scene_generate.py |   2 +-
 text_ai.py        |  78 +++++++++----------
 9 files changed, 305 insertions(+), 279 deletions(-)
 create mode 100644 crash.log

diff --git a/.gitignore b/.gitignore
index 73b62c7..2df8315 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,13 +12,15 @@ models/
 
 # Workspace data (user-generated)
 workspace/
 
+# Virtual env
+venv/
+.venv/
+
 # Backup
 _backup/
 
-# Environment
-.env
-venv/
-.venv/
+# Environment
+.env
 
 # IDE
 .vscode/
diff --git a/README.md b/README.md
index 87cb70a..f966693 100644
--- a/README.md
+++ b/README.md
@@ -1,132 +1,132 @@
-# Videoer
+# VidMarmot
 
-**AI-powered video generation pipeline** — 从文章到视频的一站式工具。
+> **VidMarmot** — 为英语课本音频配画的 AI 工具。
 
-给定一篇英文文章(文本)和对应的朗读音频,自动完成:
+给一篇课文文本 + 对应的朗读音频,VidMarmot 会自动拆分场景、生成配图、对齐语音时间轴,最终合成一个带字幕的视频。
+
+## 为什么做这个?
+ +老师总让我帮忙做课文视频。一次两次还好,做多了真的烦。 + +所以我就写了这个工具——把整个流程自动化了:放进去文本和音频,点几下按钮,视频就出来了。 + +## 主要用途 + +- **英语课本课文** — 给每篇课文的朗读音频配上场景画面 +- **故事类文章** — 自动拆分场景,逐张生成配图 +- **教学演示** — 生成带字幕的场景切换视频 ``` -文章文本 + 朗读音频 → AI 场景划分 → 逐场景生成配图 → ASR 时间对齐 → 合成视频(含字幕) +课文文本 + 朗读音频 → AI 拆分场景 → 逐场景生成配图 → 语音对齐时间轴 → 合成视频(含字幕) ``` -## Preview +## 功能 + +- **AI 场景划分** — 支持 Qwen / GLM / DeepSeek / 阿里云百炼 / OpenAI 兼容接口 +- **AI 文生图** — 支持 Kolors / Qwen-Image 模型,逐张生成场景配图 +- **逐张审查** — 每张图生成后可以预览、确认、重新生成或跳过 +- **语音对齐** — 基于 Qwen3-ForcedAligner 的 ASR 强制对齐 +- **视频合成** — MoviePy 合成最终视频,自动添加字幕 + +## 预览 ![Pipeline Overview](docs/pipeline.png) -## Features +## 快速开始 -- **AI Scene Planning** — 基于 LLM(Qwen / GLM)智能划分场景,提取角色、画面描述 -- **AI Image Generation** — 支持 Kolors / Qwen-Image 文生图模型,逐张生成场景配图 -- **Interactive Review** — 逐张审查、确认/重新生成场景图 -- **Forced Alignment** — 基于 Qwen3-ForcedAligner 的语音-文本时间对齐 -- **Video Synthesis** — MoviePy 合成最终视频,自动添加字幕 - -## Architecture - -``` -release1/ -├── gui.py # PyQt6 GUI (main entry) -├── scene_plan.py # LLM scene planning + prompt engineering -├── image_gen.py # Text-to-image API calls -├── asr.py # ASR forced alignment -├── make_video.py # Video synthesis + subtitle rendering -├── text_ai.py # Shared LLM API client -├── config.py # Model paths, API keys, defaults -├── run.bat # Windows launcher -└── qwen_download.py # One-time model download script -``` - -## Workflow - -``` -1. Select workspace (folder with article.txt + voice.mp3) -2. AI Scene Planning → scene_plan.json -3. Image Generation → scene_01.png, scene_02.png, ... -4. ASR Alignment → result.json + timestamps into scene_plan -5. 
Video Synthesis → output_video.mp4 -``` - -## Quick Start - -### Prerequisites +### 环境要求 - Python 3.12+ -- Conda (recommended) -- NVIDIA GPU (for local ASR model) +- Conda +- NVIDIA GPU(本地 ASR 模型需要) -### Setup +### 安装 ```bash -# Create conda environment -conda create -n Videoer python=3.12 -y -conda activate Videoer +# 创建环境 +conda create -n VidMarmot python=3.12 -y +conda activate VidMarmot -# Install dependencies +# 安装依赖 pip install PyQt6 moviepy Pillow requests openai pip install funasr modelscope torch torchaudio -# Download ASR model +# 下载 ASR 模型(约 1.2GB) python qwen_download.py ``` -### Configuration +### 配置 API Key -Edit `config.py` to set your API keys: +编辑 `config.py`,在对应模型的 `api_key` 字段填入你的 Key。只需填你用到的服务即可。 -```python -# LLM providers (scene planning) -LLM_PROVIDERS = { - "Qwen3.5-35B (ModelScope)": { - "api_key": "YOUR_KEY", - ... - }, - ... -} +| 服务 | 用途 | Key 对应 | 免费额度 | +|------|------|----------|----------| +| ModelScope | LLM + 文生图 | `MODELSCOPE_API_KEY` | 有 | +| 硅基流动 | LLM + 文生图 | `SILICONFLOW_API_KEY` | 有 | +| 阿里云百炼 | LLM (Qwen3-235B) | `DASHSCOPE_API_KEY` | 有 | +| DeepSeek | LLM (V3/R1) | `DEEPSEEK_API_KEY` | 有 | +| OpenAI 兼容 | 自定义 Router | `OPENAI_API_KEY` | - | -# Image generation -SILICONFLOW_API_KEY = "YOUR_KEY" -MODELSCOPE_API_KEY = "YOUR_KEY" -``` - -> **Tip**: ModelScope and SiliconFlow both offer free-tier API keys. - -### Run +### 运行 ```bash -# GUI mode (recommended) python gui.py -# Or on Windows +# 或 Windows 双击 run.bat ``` -### Workspace Structure +### 工作区结构 -Each video project lives in a workspace folder: +每个视频项目是一个文件夹: ``` -workspace/my_project/ -├── article.txt # Source article text -├── voice.mp3 # Narration audio -├── scene_plan.json # Generated scene plan (auto) -├── result.json # ASR alignment result (auto) -├── scene_01.png # Generated images (auto) -├── scene_02.png -├── ... 
-└── output_video.mp4 # Final output (auto) +workspace/my_lesson/ +├── article.txt # 课文文本 +├── voice.mp3 # 朗读音频 +├── scene_plan.json # 场景计划(自动生成) +├── result.json # ASR 对齐结果(自动生成) +├── scene/ # 生成的场景图 +│ ├── scene_001.png +│ ├── scene_002.png +│ └── ... +└── output_video.mp4 # 最终视频(自动生成) ``` -## Dependencies +## 项目结构 -| Package | Purpose | -|---------|---------| -| PyQt6 | GUI framework | -| moviepy | Video composition | -| Pillow | Image processing / subtitle rendering | -| requests | HTTP API calls | -| openai | Compatible LLM client (OpenAI API format) | -| funasr | ASR forced alignment | -| modelscope | Model loading | -| torch / torchaudio | GPU inference backend | +``` +├── gui.py # PyQt6 GUI(主入口) +├── scene_plan.py # AI 场景划分 + Prompt 工程 +├── image_gen.py # 文生图 API 调用 +├── asr.py # ASR 强制对齐 +├── make_video.py # 视频合成 + 字幕渲染 +├── text_ai.py # LLM API 客户端 +├── config.py # 配置管理(路径、API、模型) +├── qwen_download.py # ASR 模型下载脚本 +├── run.bat # Windows 启动脚本 +└── .gitignore +``` + +## 依赖 + +| 包 | 用途 | +|----|------| +| PyQt6 | GUI 框架 | +| moviepy | 视频合成 | +| Pillow | 图片处理 / 字幕渲染 | +| requests | HTTP API 调用 | +| openai | 兼容 OpenAI 格式的 LLM 客户端 | +| funasr | ASR 强制对齐 | +| modelscope | 模型加载 | +| torch / torchaudio | GPU 推理后端 | + +## Roadmap + +- [ ] **图生视频** — 用生成的场景图做图生视频,让每张静态图变成动态片段,最终拼接成真正的动态视频 +- [ ] 更多文生图模型支持 +- [ ] 批量处理多个课文 +- [ ] 打包为可执行文件(pyinstaller) ## License diff --git a/config.py b/config.py index 37f381e..0085f43 100644 --- a/config.py +++ b/config.py @@ -1,81 +1,96 @@ """ -release1 配置文件 -集中管理所有模型路径、API Key、默认参数 +config.py - All configuration for VidMarmot. + +API keys, model paths, default parameters — everything lives here. +Edit this file directly to configure your setup. 
 """
 
 import os
 
-# ========== 基础路径 ==========
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-VIDEO_PROJECT_DIR = os.path.dirname(BASE_DIR)  # 上级 video/ 目录
 
-# ========== ASR 模型(绝对路径指向 video/models/)==========
-ASR_MODEL_DIR = os.path.join(
-    r'C:\pythonproject\video', 'models', 'qwen', 'Qwen3-ForcedAligner-0.6B'
-).replace('\\', '/')
 
-# ========== LLM 提供商(划分场景/角色提取用)==========
+# ========== ASR Model ==========
+# Default: project_dir/models/qwen/Qwen3-ForcedAligner-0.6B
+# Override via env var VIDMARMOT_ASR_MODEL_DIR
+ASR_MODEL_DIR = os.environ.get("VIDMARMOT_ASR_MODEL_DIR", os.path.join(BASE_DIR, "models", "qwen", "Qwen3-ForcedAligner-0.6B")).replace("\\", "/")
+
+
+# ========== LLM Providers (scene planning / text generation) ==========
+# Fill in your API keys below. Only providers with keys will be usable.
 LLM_PROVIDERS = {
-    "Qwen3.5-35B (ModelScope 免费)": {
-        "api_key": "ms-38de567b-cf88-4523-bac2-ff63d8f1e0f6",
+    "Qwen3.5-35B (ModelScope)": {
+        "api_key": "",  # ← put your ModelScope API key here
         "api_base": "https://api-inference.modelscope.cn/v1/",
         "model": "Qwen/Qwen3.5-35B-A3B",
     },
-    "GLM-4-9B (硅基流动 免费)": {
-        "api_key": "sk-mjqgwknbttvqnrjjfnxemtjgdivogjaqsftbvoifwjvruwsq",
+    "GLM-4-9B (SiliconFlow)": {
+        "api_key": "",  # ← put your SiliconFlow API key here
         "api_base": "https://api.siliconflow.cn/v1/",
         "model": "THUDM/glm-4-9b-chat",
     },
-    "Qwen3-32B (硅基流动 付费)": {
-        "api_key": "sk-mjqgwknbttvqnrjjfnxemtjgdivogjaqsftbvoifwjvruwsq",
+    "Qwen3-32B (SiliconFlow)": {
+        "api_key": "",  # ← put your SiliconFlow API key here
         "api_base": "https://api.siliconflow.cn/v1/",
         "model": "Qwen/Qwen3-32B",
     },
-    "GLM-5 (ModelScope 免费)": {
-        "api_key": "ms-38de567b-cf88-4523-bac2-ff63d8f1e0f6",
+    "GLM-5 (ModelScope)": {
+        "api_key": "",  # ← put your ModelScope API key here
         "api_base": "https://api-inference.modelscope.cn/v1/",
         "model": "ZhipuAI/GLM-5",
     },
+    "Qwen3-235B-A22B (Aliyun)": {
+        "api_key": "",  # ← put your Aliyun DashScope API key here
+        "api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1/",
"model": "qwen3-235b-a22b", + }, + "DeepSeek-V3": { + "api_key": "", # ← put your DeepSeek API key here + "api_base": "https://api.deepseek.com/v1/", + "model": "deepseek-chat", + }, + "DeepSeek-R1": { + "api_key": "", # ← put your DeepSeek API key here + "api_base": "https://api.deepseek.com/v1/", + "model": "deepseek-reasoner", + }, + "OpenAI (Custom Router)": { + "api_key": "", # ← put your OpenAI-compatible API key here + "api_base": "https://api.openai.com/v1/", # change if using a custom router + "model": "gpt-4o", + }, } -# 默认 LLM(兼容旧代码) -DEFAULT_LLM = "Qwen3.5-35B (ModelScope 免费)" -LLM_API_KEY = LLM_PROVIDERS[DEFAULT_LLM]["api_key"] -LLM_API_BASE = LLM_PROVIDERS[DEFAULT_LLM]["api_base"] -LLM_MODEL = LLM_PROVIDERS[DEFAULT_LLM]["model"] -# ========== SiliconFlow API(Kolors 文生图)========== -SILICONFLOW_API_KEY = "sk-mjqgwknbttvqnrjjfnxemtjgdivogjaqsftbvoifwjvruwsq" -SILICONFLOW_API_BASE = "https://api.siliconflow.cn/v1/images/generations" - -# ========== ModelScope API(Qwen 文生图)========== -MODELSCOPE_API_KEY = "ms-38de567b-cf88-4523-bac2-ff63d8f1e0f6" -MODELSCOPE_API_BASE = "https://api-inference.modelscope.cn/v1/images/generations" -MODELSCOPE_POLL_INTERVAL = 3 # 轮询间隔(秒) -MODELSCOPE_MAX_WAIT = 180 # 最大等待时间(秒) - -# ========== 文生图模型 ========== +# ========== Text-to-Image Models ========== +# Fill in your API keys below. 
IMAGE_MODELS = { - "Kolors(便宜快速)": { + "Kolors (SiliconFlow)": { "provider": "siliconflow", + "api_key": "", # ← put your SiliconFlow API key here + "api_base": "https://api.siliconflow.cn/v1/images/generations", "model": "Kwai-Kolors/Kolors", "default_size": "1280x720", "guidance_scale": 7.5, }, - "Qwen-Image(高质量)": { + "Qwen-Image (ModelScope)": { "provider": "modelscope", + "api_key": "", # ← put your ModelScope API key here + "api_base": "https://api-inference.modelscope.cn/v1/images/generations", + "poll_interval": 3, + "max_wait": 180, "model": "Qwen/Qwen-Image-2512", "default_size": "1280x720", "guidance_scale": 7.5, }, } -# 默认文生图模型 -DEFAULT_IMAGE_MODEL = "Kolors(便宜快速)" +DEFAULT_IMAGE_MODEL = "Kolors (SiliconFlow)" -# ========== 默认参数 ========== + +# ========== Defaults ========== DEFAULT_FPS = 24 DEFAULT_VIDEO_SIZE = "1280x720" -# ========== 通用 negative prompt ========== +# Negative prompt for image generation NEGATIVE_PROMPT = "blurry, low quality, deformed, text, letters, words, subtitle, logo, watermark, caption, label, number" diff --git a/crash.log b/crash.log new file mode 100644 index 0000000..8dfd465 --- /dev/null +++ b/crash.log @@ -0,0 +1,34 @@ + +============================================================ +2026-04-25T14:06:45.585985 +Worker.run() error: +Traceback (most recent call last): + File "Z:\release1\git\V1\gui.py", line 459, in run + result = self.func(*self.args, **self.kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "Z:\release1\git\V1\gui.py", line 1139, in task + return _sp.main(workspace=workspace, provider=provider, user_note=user_note) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "Z:\release1\git\V1\scene_plan.py", line 535, in main + plan = plan_scenes(article_text, workspace=workspace, provider=provider, user_note=user_note) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "Z:\release1\git\V1\scene_plan.py", line 445, in plan_scenes + response 
= text_ai(prompt, PLANNER_SYSTEM, provider=provider) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "Z:\release1\git\V1\text_ai.py", line 49, in text_ai + response = client.chat.completions.create( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\Administrator\AppData\Roaming\Python\Python312\site-packages\openai\_utils\_utils.py", line 286, in wrapper + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\Administrator\AppData\Roaming\Python\Python312\site-packages\openai\resources\chat\completions\completions.py", line 1211, in create + return self._post( + ^^^^^^^^^^^ + File "C:\Users\Administrator\AppData\Roaming\Python\Python312\site-packages\openai\_base_client.py", line 1297, in post + return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\Administrator\AppData\Roaming\Python\Python312\site-packages\openai\_base_client.py", line 1070, in request + raise self._make_status_error_from_response(err.response) from None +openai.AuthenticationError: Error code: 401 - {'error': {'message': 'Authentication failed, please make sure that a valid ModelScope token is supplied.', 'request_id': '56c98608-5ea6-402e-9843-4e497060518e'}} + +============================================================ diff --git a/gui.py b/gui.py index 6aeec75..2db855b 100644 --- a/gui.py +++ b/gui.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -gui.py - 视频制作流水线 GUI(release1) +gui.py - VidMarmot GUI 唯一入口,PyQt6 暗色主题 流程:选工作区 → 划分场景 → 逐张生成+审查 → ASR → 合成视频 @@ -40,7 +40,8 @@ from PyQt6.QtCore import Qt, QThread, pyqtSignal, QMutex, QWaitCondition, QTimer from PyQt6.QtGui import QPixmap, QImage, QFont, QColor, QIcon -from config import IMAGE_MODELS, DEFAULT_IMAGE_MODEL, DEFAULT_FPS, DEFAULT_VIDEO_SIZE, LLM_PROVIDERS, DEFAULT_LLM +from config import (DEFAULT_FPS, DEFAULT_VIDEO_SIZE, + LLM_PROVIDERS, IMAGE_MODELS, DEFAULT_IMAGE_MODEL) # 
============================================================ @@ -635,7 +636,7 @@ class GenerationWorker(QThread): class VideoPipelineGUI(QMainWindow): def __init__(self): super().__init__() - self.setWindowTitle("视频制作流水线 - Release 1") + self.setWindowTitle("VidMarmot") self.setGeometry(80, 80, 1280, 820) # 状态 @@ -670,7 +671,7 @@ class VideoPipelineGUI(QMainWindow): main_layout.setSpacing(8) # --- 标题 --- - title = QLabel("视频制作流水线") + title = QLabel("VidMarmot") title.setObjectName("titleLabel") title.setAlignment(Qt.AlignmentFlag.AlignCenter) main_layout.addWidget(title) @@ -690,22 +691,21 @@ class VideoPipelineGUI(QMainWindow): top_bar.addSpacing(20) - # LLM 模型选择(场景划分用) + # LLM model selector — show all providers, default to first top_bar.addWidget(QLabel("语言模型:")) self.llm_combo = QComboBox() self.llm_combo.addItems(LLM_PROVIDERS.keys()) - idx = list(LLM_PROVIDERS.keys()).index(DEFAULT_LLM) - self.llm_combo.setCurrentIndex(idx) top_bar.addWidget(self.llm_combo) top_bar.addSpacing(20) - # 文生图模型选择 + # Image model selector — show all models, default to first top_bar.addWidget(QLabel("文生图模型:")) self.model_combo = QComboBox() self.model_combo.addItems(IMAGE_MODELS.keys()) - idx = list(IMAGE_MODELS.keys()).index(DEFAULT_IMAGE_MODEL) - self.model_combo.setCurrentIndex(idx) + default_img = DEFAULT_IMAGE_MODEL + if default_img and default_img in IMAGE_MODELS: + self.model_combo.setCurrentText(default_img) top_bar.addWidget(self.model_combo) top_bar.addSpacing(20) @@ -884,8 +884,8 @@ class VideoPipelineGUI(QMainWindow): main_layout.addWidget(splitter, stretch=1) - # 初始日志 - self.log("视频制作流水线 v1.0 已启动") + # Startup log + self.log("VidMarmot 已启动") self.log("请先选择一个工作区文件夹(包含 article.txt)") # ============================================================ diff --git a/image_gen.py b/image_gen.py index 5acf5ba..889a76e 100644 --- a/image_gen.py +++ b/image_gen.py @@ -1,28 +1,24 @@ """ -image_gen.py - 统一文生图接口 -支持两个模型: - - Kolors(便宜快速)→ SiliconFlow API(同步) - - Qwen-Image(高质量)→ 
ModelScope API(异步轮询) +image_gen.py - Unified text-to-image interface. + +Providers: + - SiliconFlow (Kolors) — sync API + - ModelScope (Qwen-Image) — async polling API """ import requests import os import time from datetime import datetime -from config import ( - SILICONFLOW_API_KEY, - SILICONFLOW_API_BASE, - MODELSCOPE_API_KEY, - MODELSCOPE_API_BASE, - MODELSCOPE_POLL_INTERVAL, - MODELSCOPE_MAX_WAIT, - IMAGE_MODELS, - NEGATIVE_PROMPT, -) +from config import IMAGE_MODELS, NEGATIVE_PROMPT -def _generate_siliconflow(prompt, model_id, size, guidance, neg, save_dir, filename): - """SiliconFlow 同步 API(Kolors)""" +def _generate_siliconflow(prompt, model_id, size, guidance, neg, save_dir, filename, api_key, api_base): + """SiliconFlow sync API""" + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } payload = { "model": model_id, "prompt": prompt, @@ -33,37 +29,32 @@ def _generate_siliconflow(prompt, model_id, size, guidance, neg, save_dir, filen "negative_prompt": neg, } - headers = { - "Authorization": f"Bearer {SILICONFLOW_API_KEY}", - "Content-Type": "application/json", - } + print(f" [SiliconFlow] {prompt[:60]}{'...' if len(prompt) > 60 else ''}") - print(f" [SiliconFlow] 提交: {prompt[:60]}{'...' if len(prompt) > 60 else ''}") - - for attempt in range(6): # 最多重试 5 次 - resp = requests.post(SILICONFLOW_API_BASE, headers=headers, json=payload, timeout=120) + for attempt in range(6): + resp = requests.post(api_base, headers=headers, json=payload, timeout=120) print(f" HTTP {resp.status_code}: {resp.text[:300]}") if resp.status_code == 429: - wait = 15 * (attempt + 1) # 15s, 30s, 45s, 60s, 75s - print(f" [!] 限频,等待 {wait}s 后重试 ({attempt+1}/5)...") + wait = 15 * (attempt + 1) + print(f" [!] 
Rate limited, waiting {wait}s ({attempt+1}/5)...") time.sleep(wait) continue if resp.status_code != 200: - raise Exception(f"SiliconFlow 生成失败 ({resp.status_code}): {resp.text[:300]}") + raise Exception(f"SiliconFlow error ({resp.status_code}): {resp.text[:300]}") break else: - raise Exception("SiliconFlow 持续限频,已重试 5 次,请稍后再试或切换模型") + raise Exception("SiliconFlow rate limit, retried 5 times.") result = resp.json() images = result.get("images", []) if not images: - raise Exception(f"SiliconFlow 返回无图片: {result}") + raise Exception(f"SiliconFlow returned no images: {result}") img_url = images[0].get("url") if not img_url: - raise Exception(f"返回图片 URL 为空: {result}") + raise Exception(f"Empty image URL: {result}") img_data = requests.get(img_url, timeout=60).content @@ -78,12 +69,12 @@ def _generate_siliconflow(prompt, model_id, size, guidance, neg, save_dir, filen return {"url": img_url, "filepath": filepath} -def _generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filename): - """ModelScope 异步轮询 API(Qwen-Image)""" +def _generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filename, api_key, api_base): + """ModelScope async polling API""" submit_headers = { - "Authorization": f"Bearer {MODELSCOPE_API_KEY}", + "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", - "X-ModelScope-Async-Mode": "true" + "X-ModelScope-Async-Mode": "true", } payload = { "model": model_id, @@ -94,31 +85,32 @@ def _generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filena "negative_prompt": neg, } - print(f" [ModelScope] 提交: {prompt[:60]}{'...' if len(prompt) > 60 else ''}") - resp = requests.post(MODELSCOPE_API_BASE, headers=submit_headers, json=payload, timeout=60) + print(f" [ModelScope] {prompt[:60]}{'...' 
if len(prompt) > 60 else ''}") + resp = requests.post(api_base, headers=submit_headers, json=payload, timeout=60) if resp.status_code != 200: - raise Exception(f"ModelScope 提交失败 ({resp.status_code}): {resp.text[:300]}") + raise Exception(f"ModelScope submit failed ({resp.status_code}): {resp.text[:300]}") result = resp.json() task_id = result.get("task_id") if not task_id: - raise Exception(f"未找到 task_id: {result}") + raise Exception(f"No task_id: {result}") print(f" task_id: {task_id}") - # 轮询结果 query_headers = { - "Authorization": f"Bearer {MODELSCOPE_API_KEY}", - "X-ModelScope-Task-Type": "image_generation" + "Authorization": f"Bearer {api_key}", + "X-ModelScope-Task-Type": "image_generation", } status_url = f"https://api-inference.modelscope.cn/v1/tasks/{task_id}" + poll_interval = IMAGE_MODELS["Qwen-Image (ModelScope)"].get("poll_interval", 3) + max_wait = IMAGE_MODELS["Qwen-Image (ModelScope)"].get("max_wait", 180) start = time.time() for attempt in range(100): if attempt > 0: - time.sleep(MODELSCOPE_POLL_INTERVAL) + time.sleep(poll_interval) elapsed = int(time.time() - start) - if elapsed > MODELSCOPE_MAX_WAIT: - raise Exception(f"ModelScope 超时({MODELSCOPE_MAX_WAIT}s)") + if elapsed > max_wait: + raise Exception(f"ModelScope timeout ({max_wait}s)") qresp = requests.get(status_url, headers=query_headers, timeout=30) if qresp.status_code != 200: @@ -130,11 +122,13 @@ def _generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filena print(f" [{elapsed}s] {task_status}") if task_status == "SUCCEED": - output_images = (qresult.get("output_images") - or qresult.get("outputs", {}).get("output_images") - or []) + output_images = ( + qresult.get("output_images") + or qresult.get("outputs", {}).get("output_images") + or [] + ) if not output_images: - raise Exception(f"SUCCEED 但无图片: {qresult}") + raise Exception(f"SUCCEED but no images: {qresult}") url = output_images[0] img_data = requests.get(url, timeout=180).content @@ -149,34 +143,30 @@ def 
_generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filena return {"url": url, "filepath": filepath} elif task_status == "FAILED": - raise Exception(f"ModelScope 任务失败: {qresult.get('errors', qresult)}") + raise Exception(f"ModelScope task failed: {qresult.get('errors', qresult)}") - raise Exception(f"ModelScope 超时({MODELSCOPE_MAX_WAIT}s)") + raise Exception(f"ModelScope timeout ({max_wait}s)") def image_generate( prompt: str, save_dir: str = "./generated_images", model_name: str = None, - n: int = 1, - seed: int = None, - num_inference_steps: int = 20, - guidance_scale: float = None, - negative_prompt: str = None, filename: str = None, image_size: str = None, + guidance_scale: float = None, + negative_prompt: str = None, ) -> dict: - """ - 统一文生图接口 + """Unified text-to-image interface. Args: - prompt: 生成提示词 - save_dir: 保存目录 - model_name: 模型名称(IMAGE_MODELS 的 key),默认用 config 中的 DEFAULT_IMAGE_MODEL - image_size: 图片尺寸,默认 1280x720(16:9) - + prompt: generation prompt + save_dir: output directory + model_name: model name (key in IMAGE_MODELS), None = default + filename: output filename, None = auto + image_size: image size, None = model default Returns: - dict: {"url": str, "filepath": str} + {"url": str, "filepath": str} """ from config import DEFAULT_IMAGE_MODEL @@ -185,31 +175,34 @@ def image_generate( model_config = IMAGE_MODELS.get(model_name) if not model_config: - raise ValueError(f"未知模型: {model_name},可选: {list(IMAGE_MODELS.keys())}") + raise ValueError(f"Unknown model: {model_name}, available: {list(IMAGE_MODELS.keys())}") + + api_key = model_config.get("api_key", "") + if not api_key: + raise ValueError( + f"API key not configured for '{model_name}'. " + f"Edit config.py and fill in the api_key field." 
+ ) model_id = model_config["model"] size = image_size or model_config["default_size"] guidance = guidance_scale if guidance_scale is not None else model_config["guidance_scale"] neg = negative_prompt or NEGATIVE_PROMPT + provider = model_config["provider"] + api_base = model_config.get("api_base", "") os.makedirs(save_dir, exist_ok=True) - provider = model_config["provider"] if provider == "siliconflow": - return _generate_siliconflow(prompt, model_id, size, guidance, neg, save_dir, filename) + return _generate_siliconflow(prompt, model_id, size, guidance, neg, save_dir, filename, api_key, api_base) elif provider == "modelscope": - return _generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filename) + return _generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filename, api_key, api_base) else: - raise ValueError(f"未知 provider: {provider}") - - -def get_available_models() -> list[str]: - """返回可用的文生图模型名称列表""" - return list(IMAGE_MODELS.keys()) + raise ValueError(f"Unknown provider: {provider}") if __name__ == "__main__": - for name in get_available_models(): - print(f"\n测试模型: {name}") - result = image_generate("A cute cat sitting on a desk, 16:9 aspect ratio", model_name=name) - print(f" 路径: {result['filepath']}") + for name in list(IMAGE_MODELS.keys()): + print(f"\nTesting: {name}") + result = image_generate("A cute cat sitting on a desk, 16:9", model_name=name) + print(f" Path: {result['filepath']}") diff --git a/run.bat b/run.bat index 446fe93..302d4a3 100644 --- a/run.bat +++ b/run.bat @@ -3,23 +3,15 @@ chcp 65001 >nul cd /d "%~dp0" -:: 清除缓存 +:: clean cache del /s /q __pycache__\*.pyc 2>nul for /d %%d in (__pycache__) do rd /s /q "%%d" 2>nul -echo 正在激活 Videoer 环境并启动 GUI... +echo Starting VidMarmot... -call C:\ProgramData\anaconda3\Scripts\activate.bat Videoer -if errorlevel 1 ( - echo [错误] 无法激活 Videoer 环境 - pause - exit /b 1 -) - -cd /d "%~dp0" python "%~dp0gui.py" if errorlevel 1 ( echo. 
- echo [错误] 启动失败 + echo [ERROR] Startup failed. Make sure Python and dependencies are installed. pause ) diff --git a/scene_generate.py b/scene_generate.py index 993b6f8..8177f8d 100644 --- a/scene_generate.py +++ b/scene_generate.py @@ -84,7 +84,7 @@ def generate_single_scene( return scene -def main(workspace: str = None, model_name: str = "Kolors(便宜快速)"): +def main(workspace: str = None, model_name: str = "Kolors (SiliconFlow)"): """主流程:生成所有 pending 场景""" global WORKSPACE, PLAN_PATH, SCENE_IMG_DIR diff --git a/text_ai.py b/text_ai.py index 4232450..1be73d2 100644 --- a/text_ai.py +++ b/text_ai.py @@ -1,43 +1,44 @@ """ -text_ai.py - LLM 文本生成 -用于场景划分等 AI 推理任务 -支持多 LLM 提供商切换 +text_ai.py - LLM text generation client. + +Supports multiple providers defined in config.py. """ from openai import OpenAI -from config import LLM_PROVIDERS, DEFAULT_LLM, LLM_API_KEY, LLM_API_BASE, LLM_MODEL +from config import LLM_PROVIDERS def text_ai(in_put: str, system_prompt: str = "You are a helpful assistant.", provider: str = None) -> str: - """ - 调用 LLM 生成文本 + """Call LLM to generate text. 
Args: - in_put: 用户输入内容 - system_prompt: 系统提示词 - provider: LLM 提供商名称(对应 LLM_PROVIDERS 的 key),None 则用默认 - + in_put: user message + system_prompt: system prompt + provider: provider name (key in LLM_PROVIDERS), None = first in dict Returns: - AI 生成的文本 + generated text """ if provider and provider in LLM_PROVIDERS: cfg = LLM_PROVIDERS[provider] - api_key = cfg["api_key"] - api_base = cfg["api_base"] - model = cfg["model"] else: - api_key = LLM_API_KEY - api_base = LLM_API_BASE - model = LLM_MODEL + # Default to first provider in dict + cfg = next(iter(LLM_PROVIDERS.values())) + provider = next(iter(LLM_PROVIDERS)) - client = OpenAI( - api_key=api_key, - base_url=api_base, - ) + api_key = cfg["api_key"] + api_base = cfg["api_base"] + model = cfg["model"] - # ModelScope 的 Qwen3 系列和 GLM 系列默认开启 thinking,需要关掉 - # 注意:MiniMax 系列不是 Qwen/GLM,不需要也不能传 enable_thinking + if not api_key: + raise ValueError( + f"API key not configured for '{provider}'. " + f"Edit config.py and fill in the api_key field." + ) + + client = OpenAI(api_key=api_key, base_url=api_base) + + # ModelScope Qwen3/GLM default to thinking mode, disable it extra_body = {} is_modelscope = "modelscope" in api_base.lower() is_qwen = "qwen" in model.lower() @@ -49,38 +50,31 @@ def text_ai(in_put: str, system_prompt: str = "You are a helpful assistant.", model=model, messages=[ {"role": "system", "content": system_prompt}, - {"role": "user", "content": in_put} + {"role": "user", "content": in_put}, ], max_tokens=16384, stream=False, - extra_body=extra_body if extra_body else None, + extra_body=extra_body or None, ) - # 防御:choices 为空或 None if not response.choices: - # 尝试从 response 对象提取有用信息 resp_dict = response.model_dump() if hasattr(response, "model_dump") else {} error_msg = resp_dict.get("error", {}) - if isinstance(error_msg, dict): - err_text = error_msg.get("message", str(error_msg)) - else: - err_text = str(resp_dict) + err_text = error_msg.get("message", str(error_msg)) if isinstance(error_msg, dict) else 
str(resp_dict) raise ValueError( - f"模型 '{model}' 返回了空的 choices。\n" - f"响应内容: {err_text}\n" - f"可能是模型暂时不可用或请求被拒绝。" + f"Model '{model}' returned empty choices.\n" + f"Response: {err_text}\n" + f"Model may be unavailable or request was rejected." ) msg = response.choices[0].message content = msg.content - # 检测输出是否被截断 - finish = response.choices[0].finish_reason - if finish == "length": - print(f"[WARN] LLM output truncated (finish_reason=length), max_tokens may be too small") + if response.choices[0].finish_reason == "length": + print(f"[WARN] LLM output truncated (finish_reason=length)") if content is None: - # fallback:尝试多种字段名(不同 API 叫法不同) + # Fallback: try alternate field names for attr in ("thinking_content", "reasoning_content", "text", "output"): fallback = getattr(msg, attr, None) if fallback: @@ -88,7 +82,6 @@ def text_ai(in_put: str, system_prompt: str = "You are a helpful assistant.", break if content is None: - # 最后一搏:尝试把 message 对象当 dict 看 try: msg_dict = msg.model_dump() if hasattr(msg, "model_dump") else vars(msg) for v in msg_dict.values(): @@ -99,11 +92,8 @@ def text_ai(in_put: str, system_prompt: str = "You are a helpful assistant.", pass if content is None: - finish = response.choices[0].finish_reason raise ValueError( - f"模型 '{model}' 返回内容为空(content=None)," - f"finish_reason={finish}。\n" - f"如果使用 MiniMax 系列,请改用 Qwen3.5-35B (ModelScope 免费) 或其他 Qwen 模型。" + f"Model '{model}' returned None content (finish_reason={response.choices[0].finish_reason})." ) return content