Initial commit: V1

This commit is contained in:
theliu
2026-04-25 12:50:36 +08:00
commit 4c38e240dc
12 changed files with 3746 additions and 0 deletions
+32
View File
@@ -0,0 +1,32 @@
# Python
__pycache__/
*.py[cod]
*.egg-info/
dist/
build/
*.egg
# Models (too large for git)
models/
# Workspace data (user-generated)
workspace/
# Backup
_backup/
# Environment
.env
venv/
.venv/
# IDE
.vscode/
.idea/
*.swp
*.swo
# OS
Thumbs.db
Desktop.ini
.DS_Store
+133
View File
@@ -0,0 +1,133 @@
# Videoer
**AI-powered video generation pipeline** — 从文章到视频的一站式工具。
给定一篇英文文章(文本)和对应的朗读音频,自动完成:
```
文章文本 + 朗读音频 → AI 场景划分 → 逐场景生成配图 → ASR 时间对齐 → 合成视频(含字幕)
```
## Preview
![Pipeline Overview](docs/pipeline.png)
## Features
- **AI Scene Planning** — 基于 LLM(Qwen / GLM)智能划分场景,提取角色、画面描述
- **AI Image Generation** — 支持 Kolors / Qwen-Image 文生图模型,逐张生成场景配图
- **Interactive Review** — 逐张审查、确认/重新生成场景图
- **Forced Alignment** — 基于 Qwen3-ForcedAligner 的语音-文本时间对齐
- **Video Synthesis** — MoviePy 合成最终视频,自动添加字幕
## Architecture
```
release1/
├── gui.py # PyQt6 GUI (main entry)
├── scene_plan.py # LLM scene planning + prompt engineering
├── image_gen.py # Text-to-image API calls
├── asr.py # ASR forced alignment
├── make_video.py # Video synthesis + subtitle rendering
├── text_ai.py # Shared LLM API client
├── config.py # Model paths, API keys, defaults
├── run.bat # Windows launcher
└── qwen_download.py # One-time model download script
```
## Workflow
```
1. Select workspace (folder with article.txt + voice.mp3)
2. AI Scene Planning → scene_plan.json
3. Image Generation → scene_01.png, scene_02.png, ...
4. ASR Alignment → result.json + timestamps into scene_plan
5. Video Synthesis → output_video.mp4
```
## Quick Start
### Prerequisites
- Python 3.12+
- Conda (recommended)
- NVIDIA GPU (for local ASR model)
### Setup
```bash
# Create conda environment
conda create -n Videoer python=3.12 -y
conda activate Videoer
# Install dependencies
pip install PyQt6 moviepy Pillow requests openai
pip install funasr modelscope torch torchaudio
# Download ASR model
python qwen_download.py
```
### Configuration
Edit `config.py` to set your API keys:
```python
# LLM providers (scene planning)
LLM_PROVIDERS = {
"Qwen3.5-35B (ModelScope)": {
"api_key": "YOUR_KEY",
...
},
...
}
# Image generation
SILICONFLOW_API_KEY = "YOUR_KEY"
MODELSCOPE_API_KEY = "YOUR_KEY"
```
> **Tip**: ModelScope and SiliconFlow both offer free-tier API keys.
### Run
```bash
# GUI mode (recommended)
python gui.py
# Or on Windows
run.bat
```
### Workspace Structure
Each video project lives in a workspace folder:
```
workspace/my_project/
├── article.txt # Source article text
├── voice.mp3 # Narration audio
├── scene_plan.json # Generated scene plan (auto)
├── result.json # ASR alignment result (auto)
├── scene_01.png # Generated images (auto)
├── scene_02.png
├── ...
└── output_video.mp4 # Final output (auto)
```
## Dependencies
| Package | Purpose |
|---------|---------|
| PyQt6 | GUI framework |
| moviepy | Video composition |
| Pillow | Image processing / subtitle rendering |
| requests | HTTP API calls |
| openai | Compatible LLM client (OpenAI API format) |
| funasr | ASR forced alignment |
| modelscope | Model loading |
| torch / torchaudio | GPU inference backend |
## License
MIT
+167
View File
@@ -0,0 +1,167 @@
"""
asr.py - ASR 强制对齐模块(简化版)
使用 Qwen3-ForcedAligner 对齐音频和文本
模型路径通过 config.py 中的绝对路径指向
"""
import os
import json
import re
def _detect_language(text: str) -> str:
"""根据文本字符分布自动检测语言"""
if not text:
return "English"
# 统计非 ASCII 字符(中文等)
non_ascii = sum(1 for c in text if ord(c) >= 0x4e00)
ratio = non_ascii / len(text)
return "Chinese" if ratio > 0.1 else "English"
def run_asr(workspace: str, language: str = None) -> dict:
    """Run forced alignment between the narration audio and article text.

    Args:
        workspace: workspace folder containing article.txt and voice.mp3.
        language: alignment language; auto-detected from the text when None.

    Returns:
        dict: {"audio": str, "text": str, "segments": list}; the same data
        is also written to result.json inside the workspace.

    Raises:
        FileNotFoundError: when the model dir, audio or text file is missing.
    """
    from config import ASR_MODEL_DIR
    from qwen_asr import Qwen3ForcedAligner

    wav_file = os.path.join(workspace, "voice.mp3")
    txt_file = os.path.join(workspace, "article.txt")
    out_file = os.path.join(workspace, "result.json")

    # Validate every required path up front so failures are explicit.
    if not os.path.exists(ASR_MODEL_DIR):
        raise FileNotFoundError(f"ASR 模型路径不存在: {ASR_MODEL_DIR}")
    if not os.path.exists(wav_file):
        raise FileNotFoundError(f"音频文件不存在: {wav_file}")
    if not os.path.exists(txt_file):
        raise FileNotFoundError(f"文本文件不存在: {txt_file}")

    with open(txt_file, 'r', encoding='utf-8') as fh:
        article = fh.read().strip()

    print(f"[ASR] 文本长度: {len(article)} 字符")
    print(f"[ASR] 音频文件: {wav_file}")
    print(f"[ASR] 模型路径: {ASR_MODEL_DIR}")

    # Load the aligner from the local model directory (CPU inference).
    print("[ASR] 正在加载模型...")
    aligner = Qwen3ForcedAligner.from_pretrained(
        ASR_MODEL_DIR,
        local_files_only=True,
        device_map="cpu"
    )
    print("[ASR] 模型加载成功")

    # Auto-detect the language when the caller did not specify one.
    if language is None:
        language = _detect_language(article)
        print(f"[ASR] 检测语言: {language}")

    print("[ASR] 正在对齐...")
    results = aligner.align(
        audio=wav_file,
        text=article,
        language=language
    )

    # Flatten the aligner output into plain time-stamped segments.
    segments = [
        {
            "text": item.text,
            "start": round(item.start_time, 3),
            "end": round(item.end_time, 3),
        }
        for result in results
        for item in result.items
    ]

    output_data = {
        "audio": wav_file,
        "text": article,
        "segments": segments
    }
    with open(out_file, 'w', encoding='utf-8') as fh:
        json.dump(output_data, fh, ensure_ascii=False, indent=2)

    print(f"[ASR] 完成,共 {len(segments)} 个片段,保存到 {out_file}")
    return output_data
def match_scenes_to_audio(workspace: str) -> dict:
    """Match ASR segments against the scene_plan scenes by text and write
    start_time / end_time into every scene, updating scene_plan.json.

    Args:
        workspace: workspace folder with scene_plan.json, result.json and
            voice.mp3.

    Returns:
        dict: the updated scene_plan (with timing info).

    Raises:
        FileNotFoundError: when any required input file is missing.
        RuntimeError: when the audio duration cannot be determined.
    """
    from make_video import load_scene_plan, load_asr_result, assign_scenes_to_segments
    from scene_plan import _get_audio_duration

    plan_path = os.path.join(workspace, "scene_plan.json")
    result_path = os.path.join(workspace, "result.json")
    audio_path = os.path.join(workspace, "voice.mp3")

    if not os.path.exists(plan_path):
        raise FileNotFoundError(f"scene_plan.json 不存在: {plan_path}")
    if not os.path.exists(result_path):
        raise FileNotFoundError(f"result.json 不存在: {result_path}")
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"voice.mp3 不存在: {audio_path}")

    scene_plan = load_scene_plan(workspace)
    asr_result = load_asr_result(workspace)
    audio_duration = _get_audio_duration(audio_path)
    if audio_duration is None:
        raise RuntimeError(f"无法获取音频时长: {audio_path}")

    # Rebuild each scene's sentence list for ASR matching.
    # Prefer the "text" field (original excerpt); fall back to legacy "lines".
    for scene in scene_plan["scenes"]:
        text_val = scene.get("text", "")
        lines_val = scene.get("lines", "")
        raw = text_val or lines_val or ""
        if isinstance(raw, str) and raw:
            # BUG FIX: the original called raw.split("") — an empty
            # separator raises ValueError. Split on the Chinese full stop
            # ("按句号分句") as intended.
            scene["lines"] = [s.strip() for s in raw.split("。") if s.strip()]
        else:
            scene["lines"] = []

    # Reuse the matching logic from make_video.
    timeline = assign_scenes_to_segments(
        scene_plan["scenes"], asr_result.get("segments", []), audio_duration
    )

    # Write the matched times back into the scene plan.
    for item in timeline:
        scene_id = item["scene_id"]
        for scene in scene_plan["scenes"]:
            if scene["scene_id"] == scene_id:
                scene["start_time"] = round(item["start"], 3)
                scene["end_time"] = round(item["end"], 3)
                break

    with open(plan_path, 'w', encoding='utf-8') as f:
        json.dump(scene_plan, f, ensure_ascii=False, indent=2)
    print(f"[匹配] 完成,已将时间信息写入 {plan_path}")
    for item in timeline:
        dur = item["end"] - item["start"]
        print(f" Scene {item['scene_id']:2d}: {item['start']:6.2f}s - {item['end']:6.2f}s ({dur:.2f}s)")
    return scene_plan
if __name__ == "__main__":
    # Manual smoke test: align the bundled sample workspace "workspace/1".
    run_asr(os.path.join(os.path.dirname(__file__), "workspace", "1"))
+81
View File
@@ -0,0 +1,81 @@
"""
release1 配置文件
集中管理所有模型路径、API Key、默认参数
"""
import os
# ========== 基础路径 ==========
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
VIDEO_PROJECT_DIR = os.path.dirname(BASE_DIR) # 上级 video/ 目录
# ========== ASR 模型(绝对路径指向 video/models/==========
ASR_MODEL_DIR = os.path.join(
r'C:\pythonproject\video', 'models', 'qwen', 'Qwen3-ForcedAligner-0.6B'
).replace('\\', '/')
# ========== LLM 提供商(划分场景/角色提取用)==========
LLM_PROVIDERS = {
"Qwen3.5-35B (ModelScope 免费)": {
"api_key": "ms-38de567b-cf88-4523-bac2-ff63d8f1e0f6",
"api_base": "https://api-inference.modelscope.cn/v1/",
"model": "Qwen/Qwen3.5-35B-A3B",
},
"GLM-4-9B (硅基流动 免费)": {
"api_key": "sk-mjqgwknbttvqnrjjfnxemtjgdivogjaqsftbvoifwjvruwsq",
"api_base": "https://api.siliconflow.cn/v1/",
"model": "THUDM/glm-4-9b-chat",
},
"Qwen3-32B (硅基流动 付费)": {
"api_key": "sk-mjqgwknbttvqnrjjfnxemtjgdivogjaqsftbvoifwjvruwsq",
"api_base": "https://api.siliconflow.cn/v1/",
"model": "Qwen/Qwen3-32B",
},
"GLM-5 (ModelScope 免费)": {
"api_key": "ms-38de567b-cf88-4523-bac2-ff63d8f1e0f6",
"api_base": "https://api-inference.modelscope.cn/v1/",
"model": "ZhipuAI/GLM-5",
},
}
# 默认 LLM(兼容旧代码)
DEFAULT_LLM = "Qwen3.5-35B (ModelScope 免费)"
LLM_API_KEY = LLM_PROVIDERS[DEFAULT_LLM]["api_key"]
LLM_API_BASE = LLM_PROVIDERS[DEFAULT_LLM]["api_base"]
LLM_MODEL = LLM_PROVIDERS[DEFAULT_LLM]["model"]
# ========== SiliconFlow APIKolors 文生图)==========
SILICONFLOW_API_KEY = "sk-mjqgwknbttvqnrjjfnxemtjgdivogjaqsftbvoifwjvruwsq"
SILICONFLOW_API_BASE = "https://api.siliconflow.cn/v1/images/generations"
# ========== ModelScope APIQwen 文生图)==========
MODELSCOPE_API_KEY = "ms-38de567b-cf88-4523-bac2-ff63d8f1e0f6"
MODELSCOPE_API_BASE = "https://api-inference.modelscope.cn/v1/images/generations"
MODELSCOPE_POLL_INTERVAL = 3 # 轮询间隔(秒)
MODELSCOPE_MAX_WAIT = 180 # 最大等待时间(秒)
# ========== 文生图模型 ==========
IMAGE_MODELS = {
"Kolors(便宜快速)": {
"provider": "siliconflow",
"model": "Kwai-Kolors/Kolors",
"default_size": "1280x720",
"guidance_scale": 7.5,
},
"Qwen-Image(高质量)": {
"provider": "modelscope",
"model": "Qwen/Qwen-Image-2512",
"default_size": "1280x720",
"guidance_scale": 7.5,
},
}
# 默认文生图模型
DEFAULT_IMAGE_MODEL = "Kolors(便宜快速)"
# ========== 默认参数 ==========
DEFAULT_FPS = 24
DEFAULT_VIDEO_SIZE = "1280x720"
# ========== 通用 negative prompt ==========
NEGATIVE_PROMPT = "blurry, low quality, deformed, text, letters, words, subtitle, logo, watermark, caption, label, number"
+1486
View File
File diff suppressed because it is too large Load Diff
+215
View File
@@ -0,0 +1,215 @@
"""
image_gen.py - 统一文生图接口
支持两个模型:
- Kolors(便宜快速)→ SiliconFlow API(同步)
- Qwen-Image(高质量)→ ModelScope API(异步轮询)
"""
import requests
import os
import time
from datetime import datetime
from config import (
SILICONFLOW_API_KEY,
SILICONFLOW_API_BASE,
MODELSCOPE_API_KEY,
MODELSCOPE_API_BASE,
MODELSCOPE_POLL_INTERVAL,
MODELSCOPE_MAX_WAIT,
IMAGE_MODELS,
NEGATIVE_PROMPT,
)
def _generate_siliconflow(prompt, model_id, size, guidance, neg, save_dir, filename):
    """SiliconFlow synchronous API (Kolors).

    Submits one generation request, retrying with linear backoff on HTTP
    429 (rate limit), then downloads the first returned image into
    save_dir.

    Returns:
        dict: {"url": remote image URL, "filepath": local saved path}
    """
    request_body = {
        "model": model_id,
        "prompt": prompt,
        "image_size": size,
        "n": 1,
        "num_inference_steps": 20,
        "guidance_scale": guidance,
        "negative_prompt": neg,
    }
    auth_headers = {
        "Authorization": f"Bearer {SILICONFLOW_API_KEY}",
        "Content-Type": "application/json",
    }
    print(f" [SiliconFlow] 提交: {prompt[:60]}{'...' if len(prompt) > 60 else ''}")
    for attempt in range(6):  # at most 5 retries on rate limiting
        resp = requests.post(SILICONFLOW_API_BASE, headers=auth_headers, json=request_body, timeout=120)
        print(f" HTTP {resp.status_code}: {resp.text[:300]}")
        if resp.status_code == 429:
            wait = 15 * (attempt + 1)  # 15s, 30s, 45s, 60s, 75s
            print(f" [!] 限频,等待 {wait}s 后重试 ({attempt+1}/5)...")
            time.sleep(wait)
            continue
        if resp.status_code != 200:
            raise Exception(f"SiliconFlow 生成失败 ({resp.status_code}): {resp.text[:300]}")
        break
    else:
        # The for-else fires only when every attempt hit the rate limit.
        raise Exception("SiliconFlow 持续限频,已重试 5 次,请稍后再试或切换模型")
    result = resp.json()
    images = result.get("images", [])
    if not images:
        raise Exception(f"SiliconFlow 返回无图片: {result}")
    img_url = images[0].get("url")
    if not img_url:
        raise Exception(f"返回图片 URL 为空: {result}")
    # Fetch the rendered image and persist it locally.
    img_data = requests.get(img_url, timeout=60).content
    if filename is None:
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"kolors_{ts}.png"
    filepath = os.path.join(save_dir, filename)
    with open(filepath, "wb") as out:
        out.write(img_data)
    print(f" [OK] (unknown)")
    return {"url": img_url, "filepath": filepath}
def _generate_modelscope(prompt, model_id, size, guidance, neg, save_dir, filename):
    """ModelScope asynchronous polling API (Qwen-Image).

    Submits an async task, then polls the task endpoint every
    MODELSCOPE_POLL_INTERVAL seconds up to MODELSCOPE_MAX_WAIT seconds,
    downloading the first output image on success.

    Returns:
        dict: {"url": remote image URL, "filepath": local saved path}
    """
    submit_headers = {
        "Authorization": f"Bearer {MODELSCOPE_API_KEY}",
        "Content-Type": "application/json",
        "X-ModelScope-Async-Mode": "true"
    }
    payload = {
        "model": model_id,
        "prompt": prompt,
        "n": 1,
        "size": size,
        "guidance_scale": guidance,
        "negative_prompt": neg,
    }
    print(f" [ModelScope] 提交: {prompt[:60]}{'...' if len(prompt) > 60 else ''}")
    resp = requests.post(MODELSCOPE_API_BASE, headers=submit_headers, json=payload, timeout=60)
    if resp.status_code != 200:
        raise Exception(f"ModelScope 提交失败 ({resp.status_code}): {resp.text[:300]}")
    result = resp.json()
    task_id = result.get("task_id")
    if not task_id:
        raise Exception(f"未找到 task_id: {result}")
    print(f" task_id: {task_id}")
    # Poll the task until it succeeds, fails, or times out.
    query_headers = {
        "Authorization": f"Bearer {MODELSCOPE_API_KEY}",
        "X-ModelScope-Task-Type": "image_generation"
    }
    status_url = f"https://api-inference.modelscope.cn/v1/tasks/{task_id}"
    start = time.time()
    for attempt in range(100):
        if attempt > 0:
            time.sleep(MODELSCOPE_POLL_INTERVAL)
        elapsed = int(time.time() - start)
        if elapsed > MODELSCOPE_MAX_WAIT:
            # BUG FIX: the timeout message was missing its closing paren.
            raise Exception(f"ModelScope 超时({MODELSCOPE_MAX_WAIT}s)")
        qresp = requests.get(status_url, headers=query_headers, timeout=30)
        if qresp.status_code != 200:
            # Transient polling error: try again on the next tick.
            continue
        qresult = qresp.json()
        task_status = qresult.get("task_status", "")
        # Log every 5th poll, and always on terminal states.
        if attempt % 5 == 0 or task_status in ("SUCCEED", "FAILED"):
            print(f" [{elapsed}s] {task_status}")
        if task_status == "SUCCEED":
            # The image list location differs between API revisions.
            output_images = (qresult.get("output_images")
                             or qresult.get("outputs", {}).get("output_images")
                             or [])
            if not output_images:
                raise Exception(f"SUCCEED 但无图片: {qresult}")
            url = output_images[0]
            img_data = requests.get(url, timeout=180).content
            if filename is None:
                ts = datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = f"qwen_{ts}.png"
            filepath = os.path.join(save_dir, filename)
            with open(filepath, "wb") as f:
                f.write(img_data)
            print(f" [OK] (unknown) ({elapsed}s)")
            return {"url": url, "filepath": filepath}
        elif task_status == "FAILED":
            raise Exception(f"ModelScope 任务失败: {qresult.get('errors', qresult)}")
    # BUG FIX: same missing closing paren as above.
    raise Exception(f"ModelScope 超时({MODELSCOPE_MAX_WAIT}s)")
def image_generate(
    prompt: str,
    save_dir: str = "./generated_images",
    model_name: str = None,
    n: int = 1,
    seed: int = None,
    num_inference_steps: int = 20,
    guidance_scale: float = None,
    negative_prompt: str = None,
    filename: str = None,
    image_size: str = None,
) -> dict:
    """Unified text-to-image entry point.

    Args:
        prompt: generation prompt.
        save_dir: directory where the image is saved.
        model_name: key into IMAGE_MODELS; defaults to DEFAULT_IMAGE_MODEL
            from config.
        image_size: image size, defaulting to the model's default_size.
        n, seed, num_inference_steps: accepted for interface compatibility;
            not consumed by the current backends.

    Returns:
        dict: {"url": str, "filepath": str}

    Raises:
        ValueError: on an unknown model name or provider.
    """
    from config import DEFAULT_IMAGE_MODEL

    if model_name is None:
        model_name = DEFAULT_IMAGE_MODEL
    model_config = IMAGE_MODELS.get(model_name)
    if not model_config:
        raise ValueError(f"未知模型: {model_name},可选: {list(IMAGE_MODELS.keys())}")

    os.makedirs(save_dir, exist_ok=True)

    # Resolve per-call overrides against the model's configured defaults.
    chosen_size = image_size or model_config["default_size"]
    chosen_guidance = (model_config["guidance_scale"]
                      if guidance_scale is None else guidance_scale)
    chosen_neg = negative_prompt or NEGATIVE_PROMPT

    # Dispatch to the backend by provider name.
    backends = {
        "siliconflow": _generate_siliconflow,
        "modelscope": _generate_modelscope,
    }
    provider = model_config["provider"]
    backend_fn = backends.get(provider)
    if backend_fn is None:
        raise ValueError(f"未知 provider: {provider}")
    return backend_fn(prompt, model_config["model"], chosen_size,
                      chosen_guidance, chosen_neg, save_dir, filename)
def get_available_models() -> list[str]:
    """Names of the configured text-to-image models (keys of IMAGE_MODELS)."""
    return [name for name in IMAGE_MODELS]
if __name__ == "__main__":
    # Smoke test: generate one sample image with every configured model.
    for name in get_available_models():
        print(f"\n测试模型: {name}")
        result = image_generate("A cute cat sitting on a desk, 16:9 aspect ratio", model_name=name)
        print(f" 路径: {result['filepath']}")
+745
View File
@@ -0,0 +1,745 @@
"""
make_video.py - 场景图 + ASR 时间戳 + 音频 → 视频
流程:
1. 读取 voice.mp3 获取总时长
2. 读取 scene_plan.json 获取场景列表
3. 如果有 result.json(ASR 对齐),按对齐结果分配时间;否则平均分配
4. 用 moviepy 将场景图在对应时段显示,配上音频生成视频
"""
import os
import json
import math
import re
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from moviepy import (
AudioFileClip,
ImageClip,
CompositeVideoClip,
ColorClip,
concatenate_videoclips
)
def load_scene_plan(workspace: str) -> dict:
    """Read and parse <workspace>/scene_plan.json."""
    plan_file = os.path.join(workspace, 'scene_plan.json')
    with open(plan_file, 'r', encoding='utf-8') as fh:
        return json.load(fh)
def load_asr_result(workspace: str) -> dict:
    """Read <workspace>/result.json (ASR alignment); None when absent."""
    result_file = os.path.join(workspace, "result.json")
    if not os.path.exists(result_file):
        return None
    with open(result_file, 'r', encoding='utf-8') as fh:
        return json.load(fh)
def build_scene_timeline(scene_plan: dict, asr_result: dict, audio_duration: float,
                         workspace: str = None) -> list:
    """Build the scene timeline.

    Preference order:
      1. every scene already has start_time/end_time (written by the ASR
         step) — use them directly;
      2. only some scenes have timestamps — unmatched scenes share the
         audio time remaining after the last matched end;
      3. no timestamps at all — match scene text against the ASR segments,
         or split the audio evenly when there is no ASR result.

    Args:
        scene_plan: parsed scene_plan.json ({"scenes": [...]}).
        asr_result: parsed result.json, or None.
        audio_duration: total narration length in seconds.
        workspace: unused; kept for interface compatibility.

    Returns:
        list of {"scene_id", "start", "end", ...} dicts covering the audio.
    """
    scenes = scene_plan["scenes"]
    num_scenes = len(scenes)

    # --- Strategy 1: every scene already carries timestamps. ---
    if all("start_time" in s and "end_time" in s for s in scenes):
        timeline = [
            {
                "scene_id": scene["scene_id"],
                "start": scene["start_time"],
                "end": scene["end_time"],
                "has_match": True,
            }
            for scene in scenes
        ]
        # Extend the last scene to the end of the audio.
        if timeline:
            timeline[-1]["end"] = audio_duration
        return timeline

    # --- Strategy 2: only some scenes have timestamps. ---
    has_timestamp = [s for s in scenes if "start_time" in s and "end_time" in s]
    if has_timestamp and len(has_timestamp) < num_scenes:
        timeline = []
        matched_end = 0.0
        for scene in scenes:
            if "start_time" in scene and "end_time" in scene:
                timeline.append({
                    "scene_id": scene["scene_id"],
                    "start": scene["start_time"],
                    "end": scene["end_time"],
                })
                matched_end = max(matched_end, scene["end_time"])
            else:
                timeline.append({
                    "scene_id": scene["scene_id"],
                    "start": None,
                    "end": None,
                })
        # Spread the unmatched scenes evenly over the remaining audio.
        unmatched_indices = [i for i, item in enumerate(timeline) if item["start"] is None]
        if unmatched_indices:
            remaining = audio_duration - matched_end
            if remaining > 0:
                seg_dur = remaining / len(unmatched_indices)
                t = matched_end
                for idx in unmatched_indices:
                    timeline[idx]["start"] = t
                    timeline[idx]["end"] = t + seg_dur
                    t += seg_dur
            else:
                # No time left: give each a nominal 2-second slot.
                for idx in unmatched_indices:
                    timeline[idx]["start"] = matched_end
                    timeline[idx]["end"] = matched_end + 2.0
        if timeline:
            timeline[-1]["end"] = audio_duration
        return timeline

    # --- Strategy 3: no timestamps — ASR text matching or even split. ---
    if asr_result is not None:
        # Prepare per-scene sentence lists for matching; prefer the "text"
        # field, fall back to the legacy "lines" field.
        scenes_for_match = []
        for scene in scenes:
            s = dict(scene)
            text_val = s.get("text", "")
            lines_val = s.get("lines", "")
            raw = text_val or lines_val or ""
            if isinstance(raw, str) and raw:
                # BUG FIX: the original split on "" which raises ValueError
                # (empty separator); split on the Chinese full stop instead.
                s["lines"] = [x.strip() for x in raw.split("。") if x.strip()]
            elif isinstance(raw, list):
                s["lines"] = raw
            else:
                s["lines"] = []
            scenes_for_match.append(s)
        asr_segments = asr_result.get("segments", [])
        timeline = assign_scenes_to_segments(scenes_for_match, asr_segments, audio_duration)
    else:
        # Even split across all scenes.
        duration_per_scene = audio_duration / num_scenes
        timeline = []
        current_time = 0.0
        for scene in scenes:
            start = current_time
            end = min(current_time + duration_per_scene, audio_duration)
            timeline.append({
                "scene_id": scene["scene_id"],
                "start": start,
                "end": end,
            })
            current_time = end
    return timeline
def _normalize_text(text: str) -> str:
"""清理文本:去除标点、空格、转小写,用于相似度比较"""
if not text:
return ""
# 去除所有非字母数字和非中文字符
text = re.sub(r'[^\w\u4e00-\u9fff]', '', text)
return text.lower()
def _longest_common_substring_ratio(s1: str, s2: str) -> float:
"""
计算两个字符串的最长公共子串比例
使用动态规划,但为了性能优化:
- 如果字符串太长,使用简化的滑动窗口方法
"""
if not s1 or not s2:
return 0.0
# 对于短字符串,使用精确的 DP 算法
if len(s1) * len(s2) < 100000: # 乘积小于 10万
m, n = len(s1), len(s2)
# 优化空间复杂度:只用两行
prev = [0] * (n + 1)
curr = [0] * (n + 1)
max_len = 0
for i in range(1, m + 1):
for j in range(1, n + 1):
if s1[i-1] == s2[j-1]:
curr[j] = prev[j-1] + 1
max_len = max(max_len, curr[j])
else:
curr[j] = 0
prev, curr = curr, [0] * (n + 1)
return max_len / max(len(s1), len(s2))
# 对于长字符串,使用简化的滑动窗口
return _sliding_window_similarity(s1, s2)
def _sliding_window_similarity(s1: str, s2: str, window_size: int = 20) -> float:
"""
滑动窗口相似度(用于长字符串的快速估算)
将 s1 分成多个窗口,在 s2 中查找最佳匹配
"""
if len(s1) <= window_size:
# 如果 s1 本身很短,直接检查是否在 s2 中
if s1 in s2:
return 1.0
# 否则检查每个字符的出现
common = sum(1 for c in s1 if c in s2)
return common / len(s1) if s1 else 0.0
# 分窗口检查
total_score = 0.0
num_windows = 0
for i in range(0, len(s1) - window_size + 1, window_size // 2):
window = s1[i:i+window_size]
if window in s2:
total_score += 1.0
else:
# 部分匹配
common = sum(1 for c in window if c in s2)
total_score += common / window_size
num_windows += 1
return total_score / num_windows if num_windows > 0 else 0.0
def assign_scenes_to_segments(scenes: list, asr_segments: list, audio_duration: float) -> list:
    """Match scenes to ASR segments by text similarity.

    Strategy:
      1. each scene carries a "text" field (excerpt of the original article);
      2. search the ASR segments for where that excerpt occurs;
      3. longest-common-substring similarity tolerates ASR recognition errors;
      4. matches below SIMILARITY_THRESHOLD are rejected to avoid false
         positives. (The original docstring said 60% while the constant is
         0.5 — the constant is authoritative.)

    Unmatched scenes evenly share the audio time left after the last match;
    the final timeline is made gap-free, ends exactly at audio_duration,
    and scenes shorter than 1 s are merged into their successor.

    Returns:
        list of {"scene_id", "start", "end", "has_match", "similarity"} dicts.
    """
    SIMILARITY_THRESHOLD = 0.5  # minimum similarity for a match
    MAX_WINDOW_SIZE = 20  # merge at most 20 ASR segments (long sentences need more)
    # Pre-process: keep only non-empty segments, with normalized text cached.
    asr_cleaned = []
    for seg in asr_segments:
        text = seg.get('text', '').strip()
        if text:
            asr_cleaned.append({
                'text': text,
                'text_normalized': _normalize_text(text),
                'start': seg['start'],
                'end': seg['end']
            })
    def find_best_match(scene_text: str, start_seg_idx: int):
        """Find where scene_text best matches within the ASR segments,
        trying sliding windows of different segment counts starting at
        start_seg_idx."""
        if not scene_text or not scene_text.strip():
            return None
        scene_normalized = _normalize_text(scene_text)
        best_score = 0.0
        best_match = None
        # Sliding window: try combining different numbers of segments.
        for window_size in range(1, min(MAX_WINDOW_SIZE + 1, len(asr_cleaned) - start_seg_idx + 1)):
            for i in range(start_seg_idx, len(asr_cleaned) - window_size + 1):
                # Concatenate the normalized text of the windowed segments.
                combined_text = ''.join(
                    seg['text_normalized'] for seg in asr_cleaned[i:i+window_size]
                )
                # Score this window.
                similarity = _longest_common_substring_ratio(scene_normalized, combined_text)
                if similarity > best_score:
                    best_score = similarity
                    best_match = {
                        'start': asr_cleaned[i]['start'],
                        'end': asr_cleaned[i + window_size - 1]['end'],
                        'start_idx': i,
                        'end_idx': i + window_size - 1,
                        'similarity': similarity
                    }
                # Early exit once the similarity is already very high.
                if similarity >= 0.90:
                    return best_match
        # Count as matched only when the threshold is reached.
        if best_score >= SIMILARITY_THRESHOLD:
            return best_match
        return None
    # Main matching loop — scenes are matched in order, each search
    # starting after the previous scene's matched segments.
    timeline_raw = []
    last_end_idx = 0  # index just past the previous match
    for scene in scenes:
        # Prefer the "text" field; fall back to the legacy "lines" field.
        scene_text = scene.get('text', '') or scene.get('lines', '')
        if isinstance(scene_text, list):
            scene_text = ''.join(scene_text)
        match = find_best_match(scene_text, last_end_idx)
        if match:
            timeline_raw.append({
                'scene_id': scene['scene_id'],
                'start': match['start'],
                'end': match['end'],
                'has_match': True,
                'similarity': match['similarity']
            })
            # The next scene starts searching after this match.
            last_end_idx = match['end_idx'] + 1
        else:
            timeline_raw.append({
                'scene_id': scene['scene_id'],
                'start': None,
                'end': None,
                'has_match': False,
                'similarity': 0.0
            })
    # Unmatched scenes: split the remaining audio time evenly among them.
    unmatched_indices = [i for i, item in enumerate(timeline_raw) if not item['has_match']]
    if unmatched_indices:
        # Latest end time among the matched scenes.
        matched_end = 0.0
        for item in timeline_raw:
            if item['has_match'] and item['end']:
                matched_end = max(matched_end, item['end'])
        remaining_duration = audio_duration - matched_end
        if remaining_duration > 0 and len(unmatched_indices) > 0:
            seg_dur = remaining_duration / len(unmatched_indices)
            current_t = matched_end
            for idx in unmatched_indices:
                timeline_raw[idx]['start'] = current_t
                timeline_raw[idx]['end'] = current_t + seg_dur
                timeline_raw[idx]['has_match'] = True
                current_t += seg_dur
        else:
            # No time left: give each a default 2-second slot.
            for idx in unmatched_indices:
                timeline_raw[idx]['start'] = matched_end
                timeline_raw[idx]['end'] = matched_end + 2.0
                timeline_raw[idx]['has_match'] = True
    # Guarantee every scene lasts at least 0.5 seconds.
    for item in timeline_raw:
        if item['end'] is None or item['end'] <= item['start']:
            item['end'] = item['start'] + 0.5
    # Close gaps so the timeline is continuous.
    for i in range(len(timeline_raw) - 1):
        curr = timeline_raw[i]
        nxt = timeline_raw[i + 1]
        gap = nxt['start'] - curr['end']
        if gap > 0.01:  # tolerate only gaps up to 10 ms
            curr['end'] = nxt['start']
    # Extend the last scene to the end of the audio.
    if timeline_raw:
        timeline_raw[-1]['end'] = audio_duration
    # Merge very short scenes (< 1 s) into the following scene.
    merged = []
    skip_next = False
    for i in range(len(timeline_raw)):
        if skip_next:
            skip_next = False
            continue
        item = dict(timeline_raw[i])
        duration = item['end'] - item['start']
        # Too short and not the last one: absorb the next scene's span.
        if duration < 1.0 and i < len(timeline_raw) - 1:
            item['end'] = timeline_raw[i + 1]['end']
            merged.append(item)
            skip_next = True
        else:
            merged.append(item)
    return merged
def find_scene_image(workspace: str, scene_id: int) -> str:
    """Locate the rendered image for *scene_id* under <workspace>/scene/.

    Tries the zero-padded name first (scene_001.png), then the plain
    variant (scene_1.png). Returns None when neither exists.
    """
    scene_dir = os.path.join(workspace, "scene")
    for name in (f"scene_{scene_id:03d}.png", f"scene_{scene_id}.png"):
        candidate = os.path.join(scene_dir, name)
        if os.path.exists(candidate):
            return candidate
    return None
def create_video(workspace: str, timeline: list, audio_path: str, output_path: str,
                 fps: int = 24, img_size: tuple = (1280, 720),
                 subtitle: bool = True, log_fn=None):
    """
    Create the video with moviepy.

    Args:
        workspace: workspace path
        timeline: scene timeline ({"scene_id", "start", "end"} dicts)
        audio_path: narration audio file path
        output_path: output video path
        fps: frame rate
        img_size: video size (w, h)
        subtitle: whether to burn subtitles in
        log_fn: optional logging callback (defaults to print)
    """
    def log(msg):
        # Route progress messages to the GUI callback when provided.
        if log_fn:
            log_fn(msg)
        else:
            print(msg)
    clips = []
    # One image clip per timeline entry, placed at its absolute start time.
    for item in timeline:
        scene_id = item["scene_id"]
        start = item["start"]
        end = item["end"]
        duration = end - start
        if duration <= 0:
            continue
        img_path = find_scene_image(workspace, scene_id)
        if img_path:
            img_clip = ImageClip(img_path).with_duration(duration)
            img_clip = img_clip.resized(new_size=img_size)
        else:
            # No rendered image for this scene: fall back to a gray card.
            log(f" [警告] scene_{scene_id} 无图片,使用纯色背景")
            img_clip = ColorClip(size=img_size, color=(50, 50, 50)).with_duration(duration)
        img_clip = img_clip.with_start(start)
        clips.append(img_clip)
    # Subtitles
    if subtitle:
        asr_path = os.path.join(workspace, "result.json")
        if os.path.exists(asr_path):
            with open(asr_path, encoding="utf-8") as f:
                asr_result = json.load(f)
            segments = asr_result.get("segments", [])
            log(f"[*] 原始 ASR 片段: {len(segments)}")
            # Group ASR segments by time (a few seconds per group) so
            # subtitles do not flood the screen.
            def group_segments_into_sentences(segments: list) -> list:
                """
                Merge short ASR segments into reasonable subtitle groups.
                Supports both Chinese (char-level) and English (word-level)
                ASR output.
                Constraints:
                - at most 4 seconds per group
                - a character cap per group (NOTE(review): the original
                  docstring said 40 for Chinese / 60 for English, but the
                  code applies a single MAX_CHARS = 60 limit for both)
                - flush early on sentence-ending punctuation
                """
                if not segments:
                    return []
                sentence_endings = set('。!?.!?…\n')
                MAX_CHARS = 60  # character cap (English incl. spaces runs longer)
                MAX_DURATION = 4.0
                def flush(text, start, end):
                    # Append the accumulated group if it holds anything.
                    if text and start is not None:
                        grouped.append({"text": text.strip(), "start": start, "end": end})
                grouped = []
                current_words = []  # accumulated words/characters
                current_start = None
                current_end = None
                for seg in segments:
                    text = seg.get("text", "").strip()
                    if not text:
                        continue
                    seg_start = seg.get("start", 0)
                    seg_end = seg.get("end", 0)
                    if current_start is None:
                        current_start = seg_start
                    # Would adding this token overflow the group?
                    # English joins with spaces; Chinese concatenates directly.
                    preview_words = current_words + [text]
                    # Treat the token as English when it is pure ASCII.
                    is_ascii_word = all(ord(c) < 128 for c in text)
                    if is_ascii_word:
                        preview_text = " ".join(preview_words)
                    else:
                        preview_text = "".join(preview_words)
                    preview_duration = seg_end - current_start
                    # Check first: if adding overflows, flush the old group
                    # and start a new one with this token.
                    overflow = (
                        len(preview_text) > MAX_CHARS or
                        preview_duration > MAX_DURATION
                    )
                    if overflow and current_words:
                        # Flush the current group (without the new token).
                        if is_ascii_word:
                            flush(" ".join(current_words), current_start, current_end)
                        else:
                            flush("".join(current_words), current_start, current_end)
                        current_words = [text]
                        current_start = seg_start
                        current_end = seg_end
                    else:
                        current_words.append(text)
                        current_end = seg_end
                    # Sentence-ending punctuation flushes immediately.
                    if text and text[-1] in sentence_endings:
                        if is_ascii_word:
                            flush(" ".join(current_words), current_start, current_end)
                        else:
                            flush("".join(current_words), current_start, current_end)
                        current_words = []
                        current_start = None
                        current_end = None
                # Flush whatever is left over.
                if current_words:
                    if all(ord(c) < 128 for c in current_words[0]):
                        flush(" ".join(current_words), current_start or 0, current_end or 0)
                    else:
                        flush("".join(current_words), current_start or 0, current_end or 0)
                return grouped
            # Group into sentence-sized subtitle chunks.
            sentence_groups = group_segments_into_sentences(segments)
            log(f"[*] 添加字幕,共 {len(sentence_groups)} 个句子(从 {len(segments)} 个片段合并)")
            w, h = img_size
            font_size = max(int(h * 0.045), 20)
            margin_bottom = int(h * 0.10)
            max_chars_per_line = max(int(w / (font_size * 0.85)), 15)
            # Font: prefer SimHei (CJK coverage), fall back to Arial.
            font_path = "C:/Windows/Fonts/simhei.ttf"
            if not os.path.exists(font_path):
                font_path = "C:/Windows/Fonts/arial.ttf"
            try:
                pil_font = ImageFont.truetype(font_path, font_size)
            except Exception:
                pil_font = ImageFont.load_default()
            # Font metrics: measure a sample with ascenders and descenders.
            tmp_img = Image.new("RGBA", (1, 1))
            tmp_draw = ImageDraw.Draw(tmp_img)
            metrics = tmp_draw.textbbox((0, 0), "gypj8q", font=pil_font)
            font_full_height = metrics[3] - metrics[1]
            def wrap_text(text: str, max_chars: int) -> list:
                """Split text into lines by character count, preferring to
                break at punctuation."""
                if len(text) <= max_chars:
                    return [text]
                lines = []
                while len(text) > max_chars:
                    cut = max_chars
                    # Look back up to 5 chars for a punctuation break point.
                    for i in range(max_chars, max(max_chars - 5, 0), -1):
                        if text[i] in ",。!?、;:\u201c\u201d\u2018\u2019\u2026\u2014,.!?;: ":
                            cut = i + 1
                            break
                    lines.append(text[:cut])
                    text = text[cut:]
                if text:
                    lines.append(text)
                return lines
            def render_subtitle_pil(text_lines: list) -> np.ndarray:
                """Render multi-line subtitle text with PIL; returns an RGBA
                numpy array (white fill with a black outline)."""
                line_spacing = int(font_size * 0.35)
                line_height = font_full_height + line_spacing
                line_widths = []
                for line in text_lines:
                    bbox = tmp_draw.textbbox((0, 0), line, font=pil_font)
                    line_widths.append(bbox[2] - bbox[0])
                img_w = max(line_widths) + 6
                img_h = len(text_lines) * line_height + int(font_size * 0.3)
                img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
                draw = ImageDraw.Draw(img)
                for idx, line in enumerate(text_lines):
                    x = (img_w - line_widths[idx]) // 2
                    y = idx * line_height
                    # 2px black outline drawn by offsetting the text.
                    for dx in (-2, -1, 0, 1, 2):
                        for dy in (-2, -1, 0, 1, 2):
                            if dx == 0 and dy == 0:
                                continue
                            draw.text((x + dx, y + dy), line, fill=(0, 0, 0, 255), font=pil_font)
                    draw.text((x, y), line, fill=(255, 255, 255, 255), font=pil_font)
                return np.array(img)
            # Show subtitles per sentence group (not per raw segment).
            for sentence in sentence_groups:
                text = sentence["text"].strip()
                if not text:
                    continue
                seg_start = sentence["start"]
                seg_end = sentence["end"]
                seg_dur = seg_end - seg_start
                if seg_dur <= 0:
                    continue
                lines = wrap_text(text, max_chars_per_line)
                rgba = render_subtitle_pil(lines)
                txt_clip = ImageClip(rgba)
                txt_clip = txt_clip.with_position(("center", h - margin_bottom - rgba.shape[0]))
                txt_clip = txt_clip.with_duration(seg_dur).with_start(seg_start)
                clips.append(txt_clip)
        else:
            log("[提示] 未找到 result.json,跳过字幕")
    if not clips:
        raise ValueError("没有可用的场景片段!")
    # Load the narration audio.
    audio = AudioFileClip(audio_path)
    # Composite all image/subtitle clips over the full frame.
    video = CompositeVideoClip(clips, size=img_size)
    video = video.with_audio(audio)
    video = video.with_duration(audio.duration)
    # Export.
    video.write_videofile(
        output_path,
        fps=fps,
        codec='libx264',
        audio_codec='aac',
        bitrate='8000k',
    )
    video.close()
    # NOTE(review): `audio` is never closed here — consider audio.close()
    # after export to release the reader promptly.
    log(f"[OK] 视频已保存: {output_path}")
def main(workspace: str = None, fps: int = 24, size: str = "1280x720",
         subtitle: bool = True, log_fn=None):
    """Render the final video for a workspace.

    Args:
        workspace: workspace folder; defaults to <this dir>/workspace/1.
        fps: output frame rate.
        size: output resolution as "WxH".
        subtitle: whether to burn subtitles in.
        log_fn: optional progress callback; falls back to print().

    Returns:
        str: path of the written output_video.mp4.
    """
    from config import DEFAULT_VIDEO_SIZE

    if workspace is None:
        workspace = os.path.join(os.path.dirname(os.path.abspath(__file__)), "workspace", "1")
    w, h = map(int, (size or DEFAULT_VIDEO_SIZE).split("x"))
    img_size = (w, h)

    log = log_fn if log_fn else print

    log(f"[视频合成] 工作区: {workspace}")
    log(f"[视频合成] 尺寸: {img_size}, FPS: {fps}")

    # 1. Scene plan.
    scene_plan = load_scene_plan(workspace)
    num_scenes = len(scene_plan["scenes"])
    log(f"[视频合成] 共 {num_scenes} 个场景")

    # 2. Audio duration (try/finally releases the probe clip promptly).
    audio_path = os.path.join(workspace, "voice.mp3")
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"音频文件不存在: {audio_path}")
    audio = AudioFileClip(audio_path)
    try:
        audio_duration = audio.duration
        log(f"[视频合成] 音频时长: {audio_duration:.2f} 秒")
    finally:
        audio.close()

    # 3. ASR result (optional).
    asr_result = load_asr_result(workspace)
    if asr_result:
        seg_count = len(asr_result.get("segments", []))
        log(f"[视频合成] ASR: {seg_count} 个片段")
    else:
        log("[视频合成] 无 ASR 结果,按平均分配")

    # 4. Timeline, with per-scene match-quality reporting.
    timeline = build_scene_timeline(scene_plan, asr_result, audio_duration, workspace=workspace)
    log("[视频合成] 场景时间分配:")
    matched_count = 0
    unmatched_count = 0
    for item in timeline:
        dur = item['end'] - item['start']
        similarity = item.get('similarity', 0.0)
        if similarity > 0:
            quality = f"[{similarity:.0%}]"
        elif item.get('has_match', False):
            quality = "[OK]"
        else:
            quality = "[兜底]"
        if quality == "[兜底]":
            unmatched_count += 1
        else:
            matched_count += 1
        log(f" Scene {item['scene_id']:2d}: {item['start']:6.2f}s - {item['end']:6.2f}s ({dur:.2f}s) {quality}")
    if unmatched_count > 0:
        log(f"[警告] {unmatched_count}/{len(timeline)} 个场景未精确匹配,使用兜底分配")
    else:
        log(f"[OK] 所有场景均成功匹配")

    # 5. Render.
    has_image = sum(1 for item in timeline if find_scene_image(workspace, item["scene_id"]) is not None)
    log(f"[视频合成] 有图片场景: {has_image}/{len(timeline)}")
    if has_image == 0:
        log("[警告] 所有场景都没有图片,视频将使用纯色背景!")
    output_path = os.path.join(workspace, "output_video.mp4")
    create_video(workspace, timeline, audio_path, output_path, fps=fps, img_size=img_size,
                 subtitle=subtitle, log_fn=log_fn)
    log(f"[完成] 视频: {output_path}")
    return output_path
if __name__ == "__main__":
    # CLI entry point: render the default workspace ("workspace/1").
    main()
+20
View File
@@ -0,0 +1,20 @@
from modelscope import snapshot_download
import os
# One-time helper: download the Qwen3 forced-aligner model from ModelScope
# into a local ./models folder next to this script.
# 1. Model ID on the ModelScope Hub
model_id = 'qwen/Qwen3-ForcedAligner-0.6B'
# 2. Resolve a "models" folder relative to this file
save_dir = os.path.join(os.path.dirname(__file__), 'models')
# Create the directory if it does not exist yet
os.makedirs(save_dir, exist_ok=True)
print(f"下载中... 保存至: {save_dir}")
try:
    # snapshot_download caches under save_dir and returns the local model path
    model_dir = snapshot_download(model_id, cache_dir=save_dir)
    print(f"下载完成: {model_dir}")
except Exception as e:
    # Best-effort script: report the failure instead of dumping a traceback
    print(f"失败: {e}")
+25
View File
@@ -0,0 +1,25 @@
@echo off
chcp 65001 >nul
cd /d "%~dp0"
:: Remove stale Python bytecode caches before launching
del /s /q __pycache__\*.pyc 2>nul
for /d %%d in (__pycache__) do rd /s /q "%%d" 2>nul
echo 正在激活 Videoer 环境并启动 GUI...
:: Activate the "Videoer" conda environment (hard-coded Anaconda install path)
call C:\ProgramData\anaconda3\Scripts\activate.bat Videoer
if errorlevel 1 (
    echo [错误] 无法激活 Videoer 环境
    pause
    exit /b 1
)
cd /d "%~dp0"
:: Launch the GUI with the environment's python
python "%~dp0gui.py"
if errorlevel 1 (
    echo.
    echo [错误] 启动失败
    pause
)
+135
View File
@@ -0,0 +1,135 @@
"""
scene_generate.py - 场景图生成模块(简化版)
读取 scene_plan.json,对每个 pending 场景用文生图直接生成
不再需要角色元素图、Panel 拼图、img2img
支持断点续传:已生成的自动跳过,失败的可重试
"""
import json
import os
from datetime import datetime
# ========== Path configuration ==========
WORKSPACE = "workspace/1"  # default only; overwritten at runtime by gui.py / main(workspace=...)
PLAN_PATH = os.path.join(WORKSPACE, "scene_plan.json")  # scene plan JSON inside the workspace
SCENE_IMG_DIR = os.path.join(WORKSPACE, "scene")  # folder for generated scene images
def load_plan() -> dict:
    """Read and return the scene plan JSON from PLAN_PATH."""
    with open(PLAN_PATH, encoding="utf-8") as fh:
        plan = json.load(fh)
    return plan
def save_plan(plan: dict):
    """Persist *plan* back to PLAN_PATH as pretty-printed UTF-8 JSON."""
    serialized = json.dumps(plan, ensure_ascii=False, indent=2)
    with open(PLAN_PATH, "w", encoding="utf-8") as fh:
        fh.write(serialized)
def generate_single_scene(
    scene: dict,
    idx: int,
    total: int,
    model_name: str,
    on_image_generated=None,
) -> dict:
    """
    Generate the image for a single scene via text-to-image.

    Args:
        scene: scene info dict (must contain "scene_id" and "visual_prompt")
        idx: current zero-based index (used only for progress display)
        total: total number of scenes
        model_name: text-to-image model name forwarded to image_gen
        on_image_generated: optional callback (scene, filepath) for GUI review

    Returns:
        The same scene dict, updated in place: on success status/filepath are
        set to "generated"/path; on failure status/error record the exception.
    """
    scene_id = scene["scene_id"]
    visual_prompt = scene["visual_prompt"]
    print(f"\n[场景 {scene_id}] [{idx+1}/{total}]")
    print(f" Prompt: {visual_prompt[:80]}...")
    os.makedirs(SCENE_IMG_DIR, exist_ok=True)
    scene_filename = f"scene_{scene_id:03d}.png"
    try:
        # Imported lazily so this module can load without the image backend.
        from image_gen import image_generate
        result = image_generate(
            prompt=visual_prompt,
            save_dir=SCENE_IMG_DIR,
            model_name=model_name,
            filename=scene_filename,
        )
        # The generator reports the definitive path; the pre-computed local
        # path was dead code (always overwritten) and has been removed.
        filepath = result["filepath"]
        scene["status"] = "generated"
        scene["filepath"] = filepath
        print(f" [完成] {os.path.basename(filepath)}")
        # Notify the GUI (if any) so the user can review the new image.
        if on_image_generated:
            on_image_generated(scene, filepath)
        return scene
    except Exception as e:
        # Record the failure on the scene so a later run can retry it.
        scene["status"] = "failed"
        scene["error"] = str(e)
        print(f" [失败] {e}")
        return scene
def main(workspace: str | None = None, model_name: str = "Kolors(便宜快速)"):
    """Generate images for every pending scene in the plan (resumable).

    Args:
        workspace: workspace folder; when given, overrides the module-level
            WORKSPACE/PLAN_PATH/SCENE_IMG_DIR globals.
        model_name: text-to-image model name forwarded to image_gen.
    """
    global WORKSPACE, PLAN_PATH, SCENE_IMG_DIR
    if workspace:
        # Re-point all module-level paths at the chosen workspace.
        WORKSPACE = workspace
        PLAN_PATH = os.path.join(WORKSPACE, "scene_plan.json")
        SCENE_IMG_DIR = os.path.join(WORKSPACE, "scene")
    if not os.path.exists(PLAN_PATH):
        raise FileNotFoundError(f"未找到 {PLAN_PATH},请先运行场景划分")
    plan = load_plan()
    scenes = plan["scenes"]
    total = len(scenes)
    done = [s for s in scenes if s.get("status") == "generated"]
    failed = [s for s in scenes if s.get("status") == "failed"]
    pending = [s for s in scenes if s.get("status") == "pending"]
    print(f"总场景: {total} | 已完成: {len(done)} | 失败: {len(failed)} | 待生成: {len(pending)}")
    for idx, scene in enumerate(scenes):
        status = scene.get("status")
        if status == "generated":
            # Resume support: already-completed scenes are skipped.
            print(f"[{idx+1}/{total}] 场景 {scene['scene_id']} 已完成,跳过")
            continue
        updated = generate_single_scene(scene, idx, total, model_name)
        scenes[idx] = updated
        # Save after every scene so progress survives a crash or Ctrl+C.
        save_plan(plan)
        if updated["status"] == "failed":
            # Stop at the first failure; rerunning resumes from this scene.
            print(f"[停止] 场景 {scene['scene_id']} 生成失败,再次运行可续传")
            break
        print(f"进度: {sum(1 for s in scenes if s.get('status') == 'generated')}/{total}")
    done_count = sum(1 for s in scenes if s.get("status") == "generated")
    print(f"\n========== 完成 ==========")
    print(f"已完成: {done_count}/{total}")
    if done_count == total:
        print("[全部完成] 所有场景图已生成!")
    else:
        remaining = [s["scene_id"] for s in scenes if s.get("status") != "generated"]
        print(f"剩余: {remaining}")
# Script entry point: generate all pending scene images.
if __name__ == "__main__":
    main()
+594
View File
@@ -0,0 +1,594 @@
"""
scene_plan.py - 场景划分模块
AI 读取 article.txt → 划分场景(用于 ASR 对齐)
输出 scene_plan.json
"""
import json
import os
import re
import subprocess
import textwrap
from text_ai import text_ai
# Default workspace (only used when running this module standalone for testing)
_DEFAULT_WORKSPACE = "workspace/1"
# ========== 场景划分 ==========
PLANNER_SYSTEM = """You are an expert cinematic storyboard director and visual prompt engineer. Your task is to transform written articles into detailed, production-ready visual scenes for AI image generation.
## YOUR MISSION
Create richly detailed, cinematic visual prompts that bring the article to life through compelling imagery. Each scene should be a complete visual blueprint that an AI image generator can use to create stunning, professional-quality images.
---
## STEP 1 — ANALYZE THE ARTICLE TYPE
Classify the article into ONE of these categories:
**NARRATIVE** (stories, novels, scripts, personal anecdotes)
- Has characters with dialogue, emotional arcs, plot progression
- Requires character consistency across scenes
**KNOWLEDGE** (essays, science explainers, book reviews, historical analysis, philosophy)
- Abstract concepts, theories, ideas without specific characters
- Needs metaphorical and symbolic visualization
**TUTORIAL** (how-to guides, technical instructions, step-by-step processes)
- Procedural content with clear steps
- Focus on demonstrations and UI/screen elements
---
## STEP 2 — SCENE GRANULARITY STRATEGY
### FOR NARRATIVE ARTICLES:
Create **fine-grained scenes** (5-15 seconds each):
- One scene per meaningful action beat or emotional shift
- Dialogue exchanges: new scene when speaker changes or mood shifts
- Camera changes: close-up → wide shot = separate scenes
- Time/location jumps = new scene
### FOR KNOWLEDGE ARTICLES:
Create **moderate scenes** (8-20 seconds each):
- One scene per key concept or paragraph
- Don't over-merge: each distinct visual idea gets its own scene
- Use varied visualization techniques (see below)
### FOR TUTORIAL ARTICLES:
Create **step-based scenes** (5-12 seconds each):
- One scene per instruction step or sub-step
- Include before/after states if applicable
---
## STEP 3 — VISUAL PROMPT ENGINEERING (CRITICAL)
Your visual_prompt must be **DETAILED, SPECIFIC, and PRODUCTION-READY**. Follow this structure:
### MANDATORY ELEMENTS (in order):
**1. SHOT TYPE & CAMERA ANGLE** (choose specifically):
- Extreme Close-Up (ECU): eyes, hands, small objects
- Close-Up (CU): face, upper body
- Medium Shot (MS): waist up, two people conversing
- Full Shot (FS): entire body, room context
- Wide Shot (WS): landscape, establishing shot
- Bird's Eye View: overhead perspective
- Dutch Angle: tilted camera for tension
**2. SUBJECT DESCRIPTION** (be extremely specific):
FOR HUMANS (NEVER use names, always descriptive labels):
- Age range: "a woman in her late 20s"
- Ethnicity/Skin tone: "with warm olive skin" / "fair-skinned" / "deep brown skin"
- Hair: "shoulder-length wavy black hair with subtle highlights"
- Build: "slender build" / "athletic frame"
- Clothing (specific): "wearing a cream-colored cashmere turtleneck sweater"
- Expression: "with a contemplative, slightly melancholic expression"
- Pose/Action: "gazing thoughtfully out a rain-streaked window"
FIRST APPEARANCE: Define ALL physical traits completely
SUBSEQUENT APPEARANCES: Reuse EXACT same description for consistency
FOR OBJECTS/SETTINGS:
- Material: "polished mahogany desk" / "weathered stone walls"
- Condition: "vintage leather-bound books with gold embossing"
- Arrangement: "arranged in neat stacks" / "scattered haphazardly"
**3. ENVIRONMENT/BACKGROUND** (layer the details):
- Immediate setting: "in a cozy study room"
- Background elements: "floor-to-ceiling bookshelves filled with ancient tomes"
- Depth cues: "soft-focus background showing a fireplace with flickering flames"
- Weather/Time: "on a misty autumn morning" / "during golden hour sunset"
**4. LIGHTING & ATMOSPHERE** (create mood):
- Light source: "warm lamplight casting long shadows"
- Quality: "soft diffused natural light from a large window"
- Color temperature: "cool blue moonlight" / "warm amber candlelight"
- Atmospheric effects: "dust motes dancing in sunbeams" / "gentle fog rolling across the scene"
- Mood keywords: "serene and contemplative" / "tense and dramatic" / "nostalgic and dreamy"
**5. COMPOSITION & STYLE** (guide the aesthetic):
- Rule of thirds: "subject positioned off-center following rule of thirds"
- Leading lines: "perspective lines converging toward the subject"
- Color palette: "muted earth tones with pops of burgundy" / "monochromatic blues"
- Artistic style reference: "cinematic photography style reminiscent of Roger Deakins" / "painterly quality inspired by Edward Hopper" / "clean minimalist aesthetic"
- Depth of field: "shallow depth of field with creamy bokeh" / "deep focus throughout"
**6. TECHNICAL QUALITIES** (ensure image quality):
- Resolution hint: "ultra-high detail, 8k quality"
- Texture emphasis: "rich textures visible in fabric and wood grain"
- Sharpness: "razor-sharp focus on subject's eyes"
---
## VISUALIZATION TECHNIQUES FOR ABSTRACT CONCEPTS (Knowledge Articles)
When the text discusses abstract ideas, use these strategies:
**METAPHORICAL IMAGERY:**
- "Cultural bridge""an elegant stone bridge spanning a misty valley, connecting two distinct architectural styles, symbolizing East meets West"
- "Knowledge expansion""an open ancient book radiating golden light, with luminous particles forming constellations above it"
**HISTORICAL RECONSTRUCTION:**
- "ancient Chinese philosophy""a wise scholar in flowing Han dynasty robes sitting cross-legged on a bamboo mat, surrounded by scrolls, soft morning light filtering through paper windows"
**SYMBOLIC COMPOSITIONS:**
- "technological progress""a vintage pocket watch gradually transforming into a sleek smartwatch, gears and circuits merging, dramatic side lighting"
**PERSONIFICATION:**
- "artificial intelligence""a humanoid figure made of translucent glass and glowing neural networks, standing in a futuristic laboratory, blue and purple ambient lighting"
**DATA VISUALIZATION AS ART:**
- "statistical trends""elegant 3D bar charts rising like crystal structures from a reflective surface, bathed in gradient lighting from blue to orange"
---
## STEP 4 — CRITICAL RULES
1. **COMPLETE COVERAGE**: Every sentence in the article must appear in exactly one scene. No skipping.
2. **EXACT TEXT EXCERPTS**: The "text" field must contain VERBATIM quotes from the article. Do NOT paraphrase or summarize.
3. **LANGUAGE CONSISTENCY**:
- If article is in Chinese → write background AND visual_prompt in Chinese
- If article is in English → write both in English
- Maintain the article's language throughout
4. **PROMPT LENGTH & DETAIL**:
- Minimum: 60 words
- Ideal: 80-150 words
- Maximum: 200 words
- More detail = better image generation results
5. **AVOID THESE MISTAKES**:
❌ Vague: "a person thinking"
✅ Specific: "a young Asian woman with shoulder-length dark hair, wearing glasses and a white blouse, resting her chin on her hand while gazing thoughtfully at a laptop screen, soft afternoon light from a nearby window"
❌ Generic: "a beautiful landscape"
✅ Specific: "a sweeping mountain vista at sunrise, snow-capped peaks glowing pink and orange in the first light, wispy clouds drifting through valleys below, crisp alpine air suggested by sharp clarity"
6. **UNIQUENESS**: Each scene's text excerpt must be unique. No overlaps or duplicates.
7. **NEGATIVE PROMPT AVOIDANCE**: Do NOT include in visual_prompt:
- Text, letters, words, captions, labels, watermarks, logos
- Blurry, low quality, deformed elements
- Multiple conflicting perspectives in one scene
---
## OUTPUT FORMAT
Return ONLY valid JSON. No markdown formatting. No explanations. No code blocks.
{
"scenes": [
{
"scene_id": 1,
"text": "Exact quote from the article covering this scene...",
"background": "Detailed description of the setting, environment, and context in the article's language...",
"visual_prompt": "[SHOT TYPE] + [SUBJECT] + [ENVIRONMENT] + [LIGHTING] + [COMPOSITION] + [STYLE]. Rich, detailed, production-ready prompt in the article's language, 80-150 words...",
"status": "pending"
},
{
"scene_id": 2,
...
}
]
}
---
## EXAMPLES OF HIGH-QUALITY VISUAL PROMPTS
### Example 1 (Narrative - Chinese):
"visual_prompt": "中景镜头。一位三十岁左右的亚洲女性,皮肤白皙,留着齐肩的黑色直发,穿着米色羊绒毛衣和深灰色长裤,坐在咖啡馆靠窗的位置。她双手捧着一杯冒着热气的拿铁咖啡,眼神略带忧郁地凝视着窗外淅沥的雨滴。背景是模糊的咖啡馆内部,暖黄色的吊灯和木质桌椅营造出温馨的氛围。柔和的自然光透过布满雨珠的玻璃窗洒入,在她的侧脸投下温柔的阴影。电影感摄影风格,浅景深,温暖的色调,充满沉思和怀旧的情绪。"
### Example 2 (Knowledge - English):
"visual_prompt": "Wide establishing shot. An ancient library interior with towering floor-to-ceiling oak bookshelves filled with leather-bound volumes dating back centuries. A grand wooden reading table sits in the center, illuminated by a single ornate brass lamp casting warm golden light. Dust motes dance in the atmospheric light beams streaming through tall arched windows. The perspective uses leading lines from the bookshelf aisles converging toward a distant figure of a scholar in Renaissance-era robes. Cinematic photography with deep shadows and rich amber tones, reminiscent of classical paintings by Rembrandt. Ultra-detailed, showcasing intricate wood carvings and the texture of aged parchment."
### Example 3 (Tutorial - Chinese):
"visual_prompt": "特写镜头。一双修长的手正在操作一台银色的笔记本电脑,屏幕上显示着Python代码编辑器界面,代码清晰可见。手指悬停在键盘上方,准备敲击回车键。桌面整洁,旁边放着一杯绿茶和一个打开的笔记本,上面有手写的流程图。明亮的白色台灯光线从左侧照射,营造专注的工作氛围。现代简约风格,高清晰度,强调屏幕上的代码细节和手指的动作瞬间。冷色调为主,点缀温暖的木质桌面纹理。"
---
NOW, analyze the provided article and create your detailed scene breakdown. Remember: MORE DETAIL = BETTER IMAGES. Be specific, be cinematic, be creative.
"""
def _fix_json_text(text: str) -> str:
"""尝试修复 LLM 返回的常见 JSON 格式问题"""
# 找第一个 { 到最后一个 } 之间的内容
start = text.find("{")
end = text.rfind("}")
if start == -1 or end == -1:
return text
text = text[start:end+1]
# 修复:中文引号 "" '' 替换为转义
text = text.replace("\u201c", '\\"').replace("\u201d", '\\"')
text = text.replace("\u2018", "'").replace("\u2019", "'")
# 修复:把 \n 换行符字面文本变成真正的换行(LLM 有时输出 \\n)
text = text.replace("\\n", "\n")
# 修复:控制字符(除了换行和制表)
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
return text
def _extract_scenes_fallback(text: str) -> list:
"""
主力解析方案:逐个提取 scene 块。
先尝试 json.loads,失败就用正则暴力提取。
"""
scenes = []
# 匹配 "scene_id" 字段所在的 { } 块,兼容多余引号如 "scene_id": 23"
pattern = r'\{\s*"scene_id"\s*:\s*"?(\d+)"?\s*[,\s]'
pos = 0
while True:
m = re.search(pattern, text[pos:])
if not m:
break
block_start = pos + m.start()
# 从这个 { 开始找匹配的 }
depth = 0
i = block_start
while i < len(text):
if text[i] == '{':
depth += 1
elif text[i] == '}':
depth -= 1
if depth == 0:
block = text[block_start:i+1]
# 清理控制字符
block = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', block)
# 先尝试 json.loads
try:
scene = json.loads(block)
if isinstance(scene, dict) and "scene_id" in scene:
scenes.append(scene)
except json.JSONDecodeError:
# json.loads 失败,暴力提取
scene = _brute_extract_scene(block)
if scene:
scenes.append(scene)
break
i += 1
pos = block_start + 1
return scenes
def _brute_extract_scene(block: str) -> dict | None:
"""暴力从一段文本中提取 scene 字段,不依赖 json.loads"""
scene = {}
# scene_id:兼容多余引号
m_id = re.search(r'"scene_id"\s*:\s*"?(\d+)"?', block)
if m_id:
scene["scene_id"] = int(m_id.group(1))
else:
return None
# text:可能有中文内容,匹配到下一个字段前
m_text = re.search(r'"text"\s*:\s*"(.+?)"\s*(?:,|\})', block, re.DOTALL)
if m_text:
scene["text"] = m_text.group(1).strip()
# 兼容旧字段 lines
if "text" not in scene:
m_lines = re.search(r'"lines"\s*:\s*"(.+?)"\s*(?:,|\})', block, re.DOTALL)
if m_lines:
scene["text"] = m_lines.group(1).strip()
# background:可能有中文引号 "" 嵌套
m_bg = re.search(r'"background"\s*:\s*"(.+?)"\s*,\s*"visual_prompt"', block, re.DOTALL)
if m_bg:
scene["background"] = m_bg.group(1).strip()
else:
# 兜底:匹配到最后一个 " 之前
m_bg2 = re.search(r'"background"\s*:\s*"(.+?)"\s*(?:,|\})', block, re.DOTALL)
if m_bg2:
scene["background"] = m_bg2.group(1).strip()
# visual_prompt
m_prompt = re.search(r'"visual_prompt"\s*:\s*"(.+?)"\s*,\s*"status"', block, re.DOTALL)
if m_prompt:
scene["visual_prompt"] = m_prompt.group(1).strip()
else:
m_prompt = re.search(r'"visual_prompt"\s*:\s*"(.+)"\s*\}', block, re.DOTALL)
if m_prompt:
scene["visual_prompt"] = m_prompt.group(1).strip()
else:
scene["visual_prompt"] = "No visual prompt"
if "text" not in scene:
scene["text"] = ""
scene["status"] = "pending"
return scene
def _get_audio_duration(audio_path: str) -> float | None:
"""Get audio duration in seconds. Tries ffprobe first, falls back to mutagen."""
if not audio_path or not os.path.exists(audio_path):
print(f"[DEBUG] audio not found: {audio_path}")
return None
# Method 1: ffprobe
try:
result = subprocess.run(
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
"-of", "csv=p=0", audio_path],
capture_output=True, text=True, timeout=10,
creationflags=subprocess.CREATE_NO_WINDOW if os.name == "nt" else 0,
)
raw = result.stdout.strip()
if raw:
return float(raw)
except FileNotFoundError:
print("[DEBUG] ffprobe not found, trying mutagen...")
except Exception as e:
print(f"[DEBUG] ffprobe error: {e}")
# Method 2: mutagen (pure Python, no external binary)
try:
from mutagen.mp3 import MP3
audio = MP3(audio_path)
return audio.info.length
except ImportError:
print("[DEBUG] mutagen not installed. Run: pip install mutagen")
except Exception as e:
print(f"[DEBUG] mutagen error: {e}")
return None
def plan_scenes(article_text: str, workspace: str | None = None, provider: str | None = None, user_note: str | None = None) -> dict:
    """
    Ask the LLM to split the article into visual scenes.

    Args:
        article_text: full article text
        workspace: workspace folder (contains article.txt and voice.mp3)
        provider: LLM provider name (key of LLM_PROVIDERS)
        user_note: optional extra instructions from the user

    Returns:
        {"scenes": [...]} — raw scene dicts parsed from the LLM reply.

    Raises:
        ValueError: when the LLM returns None or nothing parseable.
    """
    if workspace is None:
        workspace = _DEFAULT_WORKSPACE
    # Derive a pacing hint from the narration length, when the audio exists.
    audio_path = os.path.join(workspace, "voice.mp3")
    duration = _get_audio_duration(audio_path)
    duration_hint = ""
    if duration and duration > 0:
        minutes = int(duration // 60)
        seconds = int(duration % 60)
        min_scenes = max(5, int(duration / 60 * 5))  # ~5 scenes/min
        max_scenes = int(duration / 60 * 16)  # ~16 scenes/min
        duration_hint = f"""
IMPORTANT — AUDIO DURATION CONSTRAINT:
The voiceover audio is exactly {minutes}m {seconds}s ({int(duration)} seconds).
Recommended scene count: {min_scenes} to {max_scenes} (each scene ~5-15 seconds).
This is a recommendation for pacing. You may adjust based on the article's content."""
    # Optional user-supplied instructions, spliced into the prompt.
    user_note_section = ""
    if user_note and user_note.strip():
        user_note_section = f"""
---
## USER'S SPECIAL INSTRUCTIONS
Please pay attention to the following special requirements from the user:
{user_note.strip()}
Incorporate these instructions into your scene planning and visual prompt generation.
---
"""
    prompt = f"""## ARTICLE TO CONVERT
Here is the complete article text that you need to divide into visual scenes:
{textwrap.dedent(article_text)}
---
## YOUR TASK
Analyze this article thoroughly and create a detailed scene-by-scene visual breakdown following the guidelines in the system prompt.
**Key Requirements:**
1. **Cover every sentence** - Do not skip any content
2. **Be extremely detailed** in visual prompts (80-150 words each)
3. **Use cinematic language** - shot types, lighting, composition, mood
4. **Maintain consistency** - characters look the same across scenes
5. **Match the article's language** - if Chinese, write prompts in Chinese
{duration_hint}
{user_note_section}
**Remember:** Your visual prompts will be used directly by AI image generators. The more specific and detailed you are, the better the final images will be. Think like a film director planning each shot.
Now, create your scene breakdown with rich, production-ready visual prompts.
"""
    print("=" * 60)
    print(f"[PROMPT] system prompt: {len(PLANNER_SYSTEM)} chars")
    print(f"[PROMPT] user prompt: {len(prompt)} chars")
    print("=" * 60)
    response = text_ai(prompt, PLANNER_SYSTEM, provider=provider)
    if response is None:
        raise ValueError("text_ai 返回了 None,请检查 LLM 配置和网络连接")
    # Keep the raw LLM reply on disk for debugging.
    plan_path = os.path.join(workspace, "scene_plan.json")  # NOTE(review): unused here; saving happens in main()
    debug_path = os.path.join(workspace, "_debug_response.txt")
    try:
        with open(debug_path, "w", encoding="utf-8") as f:
            f.write(response)
        print(f"[DEBUG] 原始返回已保存到 {debug_path}")
    except Exception:
        pass  # debug dump is best-effort only
    print(f"[DEBUG] LLM response length: {len(response)} chars")
    # Strip a possible markdown code fence wrapped around the JSON.
    text = response.strip()
    if text.startswith("```"):
        lines = text.split("```")
        text = lines[1] if len(lines) > 1 else lines[0]
        if text.startswith("json"):
            text = text[4:]
    text = text.strip()
    # Parse: try json.loads on the raw and repaired text (fast path),
    # then fall back to per-block brute extraction (the workhorse).
    data = None
    scenes = None
    for attempt, raw in enumerate([text, _fix_json_text(text)]):
        try:
            data = json.loads(raw)
            break
        except json.JSONDecodeError as e:
            print(f"[DEBUG] JSON parse attempt {attempt+1} failed: {e.msg} at line {e.lineno} col {e.colno}")
    if data is None:
        # Whole-document parsing failed; extract scene blocks one by one.
        scenes = _extract_scenes_fallback(text)
        if scenes:
            print(f"[RECOVERY] JSON 整体解析失败,暴力提取到 {len(scenes)} 个场景")
    # Pull the scenes list out of whatever shape the reply had.
    if scenes is not None:
        pass  # already recovered by the fallback above
    elif data is not None:
        if isinstance(data, list):
            scenes = data
        elif "scenes" in data:
            scenes = data["scenes"]
        else:
            # Last resort: any list value whose first item looks like a scene.
            for v in data.values():
                if isinstance(v, list) and v and "scene_id" in v[0]:
                    scenes = v
                    break
    if not scenes:
        raise ValueError("AI 返回内容无法解析为场景数据。请查看工作区下的 _debug_response.txt")
    return {"scenes": scenes}
def main(workspace: str | None = None, provider: str | None = None, user_note: str | None = None):
    """
    Run scene planning end to end and persist scene_plan.json.

    Args:
        workspace: workspace folder (contains article.txt)
        provider: LLM provider name
        user_note: optional extra instructions from the user

    Returns:
        The plan dict ({"scenes": [...]}) returned by plan_scenes.
    """
    if workspace is None:
        workspace = _DEFAULT_WORKSPACE
    plan_path = os.path.join(workspace, "scene_plan.json")
    article_path = os.path.join(workspace, "article.txt")
    print("[场景划分] 开始...")
    print(f" 工作区: {workspace}")
    # 1. Read the article verbatim (no line numbering).
    with open(article_path, encoding="utf-8") as f:
        article_raw = f.read().strip()
    # Drop blank lines and known boilerplate lines.
    article_lines = [l.strip() for l in article_raw.split("\n") if l.strip() and l.strip() != "Identifying the speaker"]
    article_text = "\n".join(article_lines)
    print(f" Article 长度: {len(article_text)} 字符")
    # 2. Ask the LLM to split the article into scenes.
    plan = plan_scenes(article_text, workspace=workspace, provider=provider, user_note=user_note)
    scenes = plan["scenes"]
    # 3. Normalize every scene: ensure all required fields exist.
    for i, scene in enumerate(scenes):
        if "status" not in scene:
            scene["status"] = "pending"
        if "scene_id" not in scene:
            scene["scene_id"] = i + 1
        else:
            scene["scene_id"] = int(scene["scene_id"])
        if "visual_prompt" not in scene:
            scene["visual_prompt"] = scene.get("background") or scene.get("description") or "No visual prompt"
        if "text" not in scene:
            # Backward compatibility with the legacy "lines" field.
            scene["text"] = scene.get("lines", "")
        if "background" not in scene:
            scene["background"] = ""
        # Remove the legacy field once migrated.
        scene.pop("lines", None)
    # 4. Save the plan to the workspace.
    output = {
        "total_scenes": len(scenes),
        "scenes": scenes
    }
    with open(plan_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\n[场景划分] 完成,共 {len(scenes)} 个场景,已保存到 {plan_path}")
    print("\n" + "="*80)
    # Per-scene previews for quick manual inspection.
    for s in scenes:
        scene_id = s['scene_id']
        text_preview = (s.get("text", "") or "").replace('\n', ' ')[:50]
        prompt_preview = s.get('visual_prompt', '')[:80]
        bg_preview = s.get('background', '')[:50]
        print(f"[SCENE {scene_id:2d}]")
        print(f" TEXT: {text_preview}...")
        print(f" PROMPT: {prompt_preview}...")
        print(f" BG: {bg_preview}...")
    print("\n" + "="*80)
    # Summary statistics about prompt richness.
    total_prompt_words = sum(len(s.get('visual_prompt', '').split()) for s in scenes)
    avg_prompt_len = total_prompt_words / len(scenes) if scenes else 0
    print(f"\n[STATS]")
    print(f" AVG PROMPT LENGTH: {avg_prompt_len:.0f} words/scene")
    print(f" TOTAL SCENES: {len(scenes)}")
    if avg_prompt_len < 50:
        print(f" [WARN] Prompts are short, consider more detail for better image quality")
    elif avg_prompt_len >= 80:
        print(f" [OK] Prompt detail level is good")
    return plan
# Script entry point: run scene planning on the default workspace.
if __name__ == "__main__":
    main()
+113
View File
@@ -0,0 +1,113 @@
"""
text_ai.py - LLM 文本生成
用于场景划分等 AI 推理任务
支持多 LLM 提供商切换
"""
from openai import OpenAI
from config import LLM_PROVIDERS, DEFAULT_LLM, LLM_API_KEY, LLM_API_BASE, LLM_MODEL
def text_ai(in_put: str, system_prompt: str = "You are a helpful assistant.",
            provider: str | None = None) -> str:
    """
    Call an OpenAI-compatible LLM and return the generated text.

    Args:
        in_put: user message content
        system_prompt: system message content
        provider: provider name (key of LLM_PROVIDERS); None uses the default
            LLM_API_KEY / LLM_API_BASE / LLM_MODEL from config.

    Returns:
        The assistant's text reply.

    Raises:
        ValueError: when the API returns no choices, or an empty content that
            cannot be recovered from any known fallback field.
    """
    if provider and provider in LLM_PROVIDERS:
        cfg = LLM_PROVIDERS[provider]
        api_key = cfg["api_key"]
        api_base = cfg["api_base"]
        model = cfg["model"]
    else:
        api_key = LLM_API_KEY
        api_base = LLM_API_BASE
        model = LLM_MODEL
    client = OpenAI(
        api_key=api_key,
        base_url=api_base,
    )
    # ModelScope's Qwen3 and GLM families enable "thinking" by default; turn it
    # off.  MiniMax models are neither, and must NOT receive enable_thinking.
    extra_body = {}
    is_modelscope = "modelscope" in api_base.lower()
    is_qwen = "qwen" in model.lower()
    is_glm = "glm" in model.lower() or "zhipuai" in model.lower()
    if is_modelscope and (is_qwen or is_glm):
        extra_body["enable_thinking"] = False
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": in_put}
        ],
        max_tokens=16384,
        stream=False,
        extra_body=extra_body if extra_body else None,
    )
    # Defensive: some providers return empty/None choices on failure.
    if not response.choices:
        # Surface whatever error detail the response carries.
        resp_dict = response.model_dump() if hasattr(response, "model_dump") else {}
        error_msg = resp_dict.get("error", {})
        if isinstance(error_msg, dict):
            err_text = error_msg.get("message", str(error_msg))
        else:
            err_text = str(resp_dict)
        raise ValueError(
            f"模型 '{model}' 返回了空的 choices。\n"
            f"响应内容: {err_text}\n"
            f"可能是模型暂时不可用或请求被拒绝。"
        )
    msg = response.choices[0].message
    content = msg.content
    # Warn when the output was cut off by the token limit.
    finish = response.choices[0].finish_reason
    if finish == "length":
        print(f"[WARN] LLM output truncated (finish_reason=length), max_tokens may be too small")
    if content is None:
        # Fallback: different APIs expose the text under different attributes.
        for attr in ("thinking_content", "reasoning_content", "text", "output"):
            fallback = getattr(msg, attr, None)
            if fallback:
                content = fallback
                break
    if content is None:
        # Last resort: scan the message dict for any non-empty string field.
        # BUGFIX: skip metadata keys such as "role" — the previous version
        # scanned all values and could return the literal string "assistant"
        # (the always-present role field) as the model's reply.
        try:
            msg_dict = msg.model_dump() if hasattr(msg, "model_dump") else vars(msg)
            for key, value in msg_dict.items():
                if key in ("role", "name"):
                    continue
                if isinstance(value, str) and value.strip():
                    content = value
                    break
        except Exception:
            pass
    if content is None:
        raise ValueError(
            f"模型 '{model}' 返回内容为空(content=None),"
            f"finish_reason={finish}\n"
            f"如果使用 MiniMax 系列,请改用 Qwen3.5-35B (ModelScope 免费) 或其他 Qwen 模型。"
        )
    return content
# Manual smoke test: requires a valid API key/base in config.py.
if __name__ == "__main__":
    result = text_ai("Hello, say hi in one sentence.")
    print(result)