长任务启动实战手册
长任务启动实战手册
这是一份可操作的实战指南,帮助你在有想法后快速启动并可靠完成长任务
快速导航
一、一分钟启动模板
方案 A: 简单文件模式 (推荐入门)
# Scaffold the my-task/ working directory: sub-directories, a task config,
# a progress checklist, and an executable init script.
# 1. Create the task directories (brace expansion yields state/, logs/, artifacts/)
mkdir -p my-task/{state,logs,artifacts}
# 2. Create the task config
# (the quoted 'EOF' delimiter makes the shell copy the body verbatim, with no expansion)
cat > my-task/task_config.json << 'EOF'
{
"task_id": "my-first-task",
"goal": "构建一个完整的用户认证系统",
"steps": [
{"id": "design", "name": "设计数据库模型", "status": "pending"},
{"id": "impl", "name": "实现认证逻辑", "status": "pending"},
{"id": "test", "name": "编写测试用例", "status": "pending"},
{"id": "doc", "name": "编写文档", "status": "pending"}
]
}
EOF
# 3. Create the progress file (a markdown checklist mirroring the four steps above)
cat > my-task/state/progress.md << 'EOF'
# 任务进度
## 目标
构建一个完整的用户认证系统
## 当前状态
- [ ] 设计数据库模型
- [ ] 实现认证逻辑
- [ ] 编写测试用例
- [ ] 编写文档
## 备注
(在这里记录重要决策和问题)
EOF
# 4. Create the init script (a placeholder that only prints two status lines)
cat > my-task/init.sh << 'EOF'
#!/bin/bash
echo "🚀 初始化任务环境..."
# 在这里添加环境初始化命令
echo "✅ 环境就绪"
EOF
# Mark the generated script executable so it can be run as ./my-task/init.sh
chmod +x my-task/init.sh
方案 B: Python 脚本模式 (推荐进阶)
# save as: run_task.py
#!/usr/bin/env python3
"""长任务执行器 - 最小可行版本"""
import json
import time
from pathlib import Path
from datetime import datetime
class LongTask:
def __init__(self, task_dir: str):
self.task_dir = Path(task_dir)
self.state_file = self.task_dir / "state" / "task_state.json"
self.log_file = self.task_dir / "logs" / "task.log"
self.progress_file = self.task_dir / "state" / "progress.md"
# 确保目录存在
for d in ["state", "logs", "artifacts"]:
(self.task_dir / d).mkdir(parents=True, exist_ok=True)
def load_state(self) -> dict:
if self.state_file.exists():
return json.loads(self.state_file.read_text())
return {"steps": [], "completed": [], "current": None}
def save_state(self, state: dict):
state["updated_at"] = datetime.now().isoformat()
self.state_file.write_text(json.dumps(state, indent=2))
def log(self, message: str, level: str = "INFO"):
entry = {
"timestamp": datetime.now().isoformat(),
"level": level,
"message": message
}
with open(self.log_file, 'a') as f:
f.write(json.dumps(entry) + '\n')
print(f"[{level}] {message}")
def get_next_step(self, state: dict) -> dict | None:
for step in state.get("steps", []):
if step.get("status") == "pending":
return step
return None
def run(self):
"""运行任务"""
self.log("任务启动")
state = self.load_state()
while True:
step = self.get_next_step(state)
if not step:
self.log("所有步骤完成!")
break
self.log(f"开始执行: {step['name']}")
step["status"] = "running"
self.save_state(state)
try:
# 这里执行实际的任务逻辑
result = self.execute_step(step)
step["status"] = "completed"
step["result"] = result
self.log(f"步骤完成: {step['name']}")
except Exception as e:
step["status"] = "failed"
step["error"] = str(e)
self.log(f"步骤失败: {step['name']} - {e}", "ERROR")
# 保存状态后退出
self.save_state(state)
raise
self.save_state(state)
self.log("任务完成")
def execute_step(self, step: dict) -> dict:
"""执行单个步骤 - 由子类实现"""
# 示例: 模拟工作
time.sleep(1)
return {"status": "done"}
if __name__ == "__main__":
    import sys

    # The task directory is the optional first CLI argument; it falls back
    # to "my-task" when no argument is given.
    cli_args = sys.argv[1:]
    task_dir = cli_args[0] if cli_args else "my-task"
    LongTask(task_dir).run()
二、五步启动流程
Step 1: 定义目标
## 任务目标模板
**一句话描述**: _______________________
**成功标准** (怎么算完成):
1. _______________________
2. _______________________
3. _______________________
**交付物** (产出什么):
- [ ] _______________________
- [ ] _______________________
**约束条件** (有什么限制):
- 时间: _______________________
- 资源: _______________________
- 技术: _______________________
Step 2: 分解步骤
## 步骤分解原则
✅ 好的步骤:
- 独立可测试
- 15-60 分钟可完成
- 有明确的完成标准
- 失败后可独立重试
❌ 不好的步骤:
- "完成整个项目"
- "写代码" (太模糊)
- "优化性能" (没有量化标准)
## 步骤清单模板
| # | 步骤名 | 预计时间 | 依赖 | 验证方法 |
|---|--------|----------|------|----------|
| 1 | | | - | |
| 2 | | | 1 | |
| 3 | | | 1 | |
| 4 | | | 2,3 | |
Step 3: 设置状态管理
// task_state.json 模板 (注意: JSON 不支持注释, 保存为文件时请删除本行)
{
"task_id": "unique-task-id",
"goal": "任务目标描述",
"created_at": "2026-03-16T10:00:00",
"updated_at": "2026-03-16T10:00:00",
"status": "running",
"steps": [
{
"id": "step-1",
"name": "步骤名称",
"status": "pending",
"dependencies": [],
"started_at": null,
"completed_at": null,
"result": null,
"error": null
}
],
"context": {
"key_decisions": [],
"notes": ""
},
"checkpoints": []
}
Step 4: 配置错误处理
# retry_config.json (注意: JSON 不支持注释, 保存为文件时请删除本行)
{
"max_retries": 3,
"base_delay": 1.0,
"max_delay": 60.0,
"jitter": true,
"error_handlers": {
"timeout": {"action": "retry", "delay_multiplier": 2},
"rate_limit": {"action": "wait", "fixed_delay": 60},
"auth_error": {"action": "fail", "notify": true},
"unknown": {"action": "retry", "max_attempts": 2}
}
}
Step 5: 启动监控
# Terminal 1: run the task
python run_task.py my-task
# Terminal 2: watch progress (re-renders the state file every 5 seconds)
watch -n 5 'python -m json.tool my-task/state/task_state.json'
# Terminal 3: follow the log, pretty-printing each JSON line
tail -f my-task/logs/task.log | python -c "
import sys, json
for line in sys.stdin:
    try:
        entry = json.loads(line)
        print(f\"[{entry['level']:5}] {entry['message']}\")
    except (ValueError, KeyError, TypeError):
        # Not a well-formed log entry: pass the raw line through unchanged.
        print(line.strip())
"
三、常用命令速查
状态查看
# Show the task state
python -m json.tool my-task/state/task_state.json
# Show the progress notes
cat my-task/state/progress.md
# Show the most recent log entries
tail -20 my-task/logs/task.log
# Show error entries (matches the literal "level": "ERROR" text of a JSON log line)
grep '"level": "ERROR"' my-task/logs/task.log
# Summarise completion progress
cat my-task/state/task_state.json | python -c "
import sys, json
data = json.load(sys.stdin)
steps = data.get('steps', [])
completed = sum(1 for s in steps if s.get('status') == 'completed')
total = len(steps)
# Guard the division: a freshly reset state has an empty steps list.
percent = completed / total * 100 if total else 0.0
print(f'进度: {completed}/{total} ({percent:.1f}%)')
"
恢复与重试
# Retry failed steps: flip every "failed" step back to "pending" and clear its error.
# The rewrite goes to a temp file, and && replaces the state file only if the
# rewrite succeeded, so a parse error cannot clobber the state with an empty file.
cat my-task/state/task_state.json | python -c "
import sys, json
data = json.load(sys.stdin)
for step in data.get('steps', []):
    if step.get('status') == 'failed':
        step['status'] = 'pending'
        step['error'] = None
print(json.dumps(data, indent=2))
" > my-task/state/task_state.json.tmp &&
    mv my-task/state/task_state.json.tmp my-task/state/task_state.json
# Restore from a checkpoint
# (list the checkpoint files newest first, then copy the chosen one over the
#  state file; ckpt_xxx.json is a placeholder for the real file name)
ls -lt my-task/state/checkpoints/
cp my-task/state/checkpoints/ckpt_xxx.json my-task/state/task_state.json
清理与重置
# Truncate the log. ":" is a no-op command, so this also works in zsh,
# where a bare "> file" runs a pager/cat and waits for input.
: > my-task/logs/task.log
# Reset the task state (this discards every step currently in the state file)
cat > my-task/state/task_state.json << 'EOF'
{"steps": [], "status": "pending"}
EOF
# Back up the current state as a timestamped copy
# (run this before the reset above if the old state should be kept)
cp -r my-task/state "my-task/state.backup.$(date +%Y%m%d_%H%M%S)"
四、问题排查指南
问题 1: 任务卡住不动
症状: 日志没有新输出,进程还在运行
排查步骤:
# 1. Check process status ([p] keeps grep from matching its own command line)
ps aux | grep '[p]ython'
# 2. Look for zombie processes: keep the header row plus rows whose STAT
#    column (8th field of "ps aux") starts with Z
ps aux | awk 'NR == 1 || $8 ~ /^Z/'
# 3. Check resource usage of the runner. Only the first matching PID is used,
#    so several matches cannot break the command line.
#    (-pid is the macOS top syntax; on Linux use: top -p "$pid")
pid=$(pgrep -f 'run_task' | head -n 1)
top -pid "$pid"
# 4. Check network connections held by that process
#    (-a ANDs the -i and -p filters instead of grepping for the PID as text)
lsof -a -i -P -p "$pid"
解决方案:
- 如果是网络阻塞,检查超时设置
- 如果是死锁,重启任务
- 添加心跳检测机制
问题 2: 内存持续增长
排查步骤:
# Monitor memory (RSS/VSZ) once per second.
# The [r] bracket stops pgrep -f from matching the watch and sh processes,
# whose own command lines contain this pattern; -d, joins multiple PIDs with
# commas for ps -p.
watch -n 1 'ps -o pid,rss,vsz,comm -p "$(pgrep -d, -f "[r]un_task")"'
# Profile memory usage with memory_profiler
# (third-party package: pip install memory_profiler; NOTE(review): it reports
#  only functions decorated with @profile — confirm against the package docs)
python -m memory_profiler run_task.py my-task
解决方案:
- 定期清理不再需要的数据
- 使用生成器而非列表
- 检查是否有循环引用
问题 3: 状态文件损坏
症状: JSON 解析失败
排查步骤:
# Validate the JSON format
python -m json.tool my-task/state/task_state.json
# If that fails, print where the parse error is (this locates the problem; it does not repair the file)
python -c "
import json
try:
    # Decode as UTF-8 explicitly: the state file may contain non-ASCII text.
    with open('my-task/state/task_state.json', encoding='utf-8') as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    print(f'错误位置: 行 {e.lineno}, 列 {e.colno}')
    print(f'错误内容: {e.msg}')
"
解决方案:
- 从检查点恢复
- 手动修复 JSON
- 重新开始任务
问题 4: API 调用频繁失败
排查步骤:
# Count error entries in the log
grep -c '"level": "ERROR"' my-task/logs/task.log
# Count rate-limit mentions, case-insensitively and with or without a
# separator ("rate limit", "rate_limit", "RateLimit")
grep -ciE 'rate.?limit' my-task/logs/task.log
# Check network latency (replace api.example.com with the real API host)
ping -c 10 api.example.com
解决方案:
- 降低请求频率
- 增加重试间隔
- 实现断路器
五、实战场景示例
场景 A: 构建 Web 应用
{
"task_id": "build-web-app",
"goal": "构建一个待办事项 Web 应用",
"steps": [
{
"id": "setup",
"name": "项目初始化",
"status": "pending",
"commands": ["npm init -y", "npm install express"],
"verify": "npm list express"
},
{
"id": "db",
"name": "数据库设计",
"status": "pending",
"files": ["schema.sql"],
"verify": "sqlite3 todo.db '.schema'"
},
{
"id": "api",
"name": "API 开发",
"status": "pending",
"files": ["routes.js", "controllers.js"],
"verify": "npm test"
},
{
"id": "frontend",
"name": "前端开发",
"status": "pending",
"files": ["public/index.html", "public/app.js"],
"verify": "curl localhost:3000"
},
{
"id": "deploy",
"name": "部署上线",
"status": "pending",
"verify": "curl https://myapp.example.com/health"
}
]
}
场景 B: 数据分析任务
{
"task_id": "data-analysis",
"goal": "分析用户行为数据",
"steps": [
{
"id": "extract",
"name": "数据提取",
"status": "pending",
"input": "raw_data.csv",
"output": "extracted.json"
},
{
"id": "clean",
"name": "数据清洗",
"status": "pending",
"input": "extracted.json",
"output": "cleaned.json"
},
{
"id": "analyze",
"name": "数据分析",
"status": "pending",
"input": "cleaned.json",
"output": "analysis_results.json"
},
{
"id": "visualize",
"name": "可视化",
"status": "pending",
"input": "analysis_results.json",
"output": "charts/"
},
{
"id": "report",
"name": "生成报告",
"status": "pending",
"input": "analysis_results.json",
"output": "report.pdf"
}
]
}
场景 C: 代码重构
{
"task_id": "refactor-auth",
"goal": "重构认证模块",
"steps": [
{
"id": "analyze",
"name": "分析现有代码",
"status": "pending",
"output": "analysis.md"
},
{
"id": "design",
"name": "设计新架构",
"status": "pending",
"output": "new_design.md"
},
{
"id": "implement",
"name": "实现新代码",
"status": "pending",
"files": ["auth/new_module.py"]
},
{
"id": "migrate",
"name": "迁移数据",
"status": "pending",
"script": "migrate.py"
},
{
"id": "test",
"name": "运行测试",
"status": "pending",
"commands": ["pytest tests/auth/"]
}
]
}
六、Checklist: 启动前确认
□ 任务目标已明确记录
□ 步骤已分解到 15-60 分钟粒度
□ 每个步骤有验证方法
□ 状态文件已创建
□ 日志目录已创建
□ 错误处理策略已确定
□ 检查点保存逻辑已实现
□ 监控方式已准备
□ 恢复流程已测试
手册版本: 1.0 | 最后更新: 2026-03-16