feat: mvp
This commit is contained in:
1
.env.example
Normal file
1
.env.example
Normal file
@@ -0,0 +1 @@
|
||||
GEMINI_API_KEY=<GEMINI_API_KEY>
|
||||
18
.gitignore
vendored
Normal file
18
.gitignore
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
# Python-generated files
|
||||
__pycache__/
|
||||
*.py[oc]
|
||||
build/
|
||||
dist/
|
||||
wheels/
|
||||
*.egg-info
|
||||
|
||||
# Virtual environments
|
||||
.venv
|
||||
.env
|
||||
|
||||
# Runtime
|
||||
models
|
||||
projects
|
||||
|
||||
# IDE
|
||||
.idea
|
||||
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.13
|
||||
67
README.md
Normal file
67
README.md
Normal file
@@ -0,0 +1,67 @@
|
||||
# Fucking Chinese Dramas dubbing pipeline
|
||||
|
||||
dubs fucking chinese drama movies to Russian/English using AI.
|
||||
|
||||
can translate a 2-hour movie in ~10 minutes
|
||||
|
||||
## requirements
|
||||
|
||||
- uv
|
||||
- ffmpeg (for audio extraction and video muxing)
|
||||
- macOS with Apple Silicon (MPS) or Linux/Windows (CPU) (not tested)
|
||||
|
||||
## installation
|
||||
|
||||
```bash
|
||||
uv sync
|
||||
```
|
||||
|
||||
all models (SenseVoice, Silero) download automatically on first run.
|
||||
|
||||
## configuration
|
||||
|
||||
create a `.env` file in the project root:
|
||||
|
||||
```
|
||||
GEMINI_API_KEY=your_key_here
|
||||
```
|
||||
|
||||
## usage
|
||||
|
||||
```bash
|
||||
. .venv/bin/activate
|
||||
python main.py
|
||||
```
|
||||
|
||||
The CLI will guide you through:
|
||||
1. selecting a project (or creating a new one)
|
||||
2. choosing translation mode
|
||||
3. running the pipeline
|
||||
|
||||
## project structure
|
||||
|
||||
```
|
||||
projects/
|
||||
your_project/
|
||||
source.mp4 ← put your video here
|
||||
... steps caching
|
||||
result_language.mkv final video with dubbed audio
|
||||
```
|
||||
|
||||
## model cache
|
||||
|
||||
models are downloaded automatically on first run and stored in:
|
||||
|
||||
| Model | Path | Size |
|
||||
|-------|------|------|
|
||||
| Silero TTS | `~/.cache/torch/hub/snakers4_silero-models_master/` | ~40 MB |
|
||||
| SenseVoice ASR | `~/.cache/modelscope/hub/models/iic/SenseVoiceSmall/` | ~900 MB |
|
||||
| FSMN-VAD | `~/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/` | ~4 MB |
|
||||
|
||||
to clear cache:
|
||||
```bash
|
||||
rm -rf ~/.cache/torch/hub/snakers4_silero-models_master/
|
||||
rm -rf ~/.cache/modelscope/hub/models/iic/
|
||||
```
|
||||
|
||||
✅ Certified neuroslop
|
||||
4
dubbing/__init__.py
Normal file
4
dubbing/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from dubbing.pipeline import Pipeline
|
||||
from dubbing.config import settings
|
||||
|
||||
__all__ = ["Pipeline", "settings"]
|
||||
100
dubbing/cli.py
Normal file
100
dubbing/cli.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from InquirerPy import inquirer
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from dubbing.config import settings
|
||||
from dubbing.models import StepStatus, Language
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
# Gemini model ids offered in the CLI, mapped to the labels shown to the user.
GEMINI_MODELS = {
    "gemini-2.0-flash-lite": "Flash Lite (faster, cheaper)",
    "gemini-2.5-flash": "Flash (better quality)",
}

# Supported translation routes, mapped to the labels shown in the picker.
LANGUAGES = {
    Language.RU: "Russian (Zh→Ru)",
    Language.EN: "English (Zh→En)",
    Language.EN_RU: "Russian via English (Zh→En→Ru)",
}
|
||||
|
||||
|
||||
def select_language() -> Language:
    """Prompt the user to pick a target language from LANGUAGES."""
    options = []
    for lang, label in LANGUAGES.items():
        options.append({"name": label, "value": lang})
    prompt = inquirer.select(
        message="Select target language:",
        choices=options,
        default=Language.RU,
    )
    return prompt.execute()
|
||||
|
||||
|
||||
def select_model() -> str:
    """Prompt the user to pick a Gemini model id from GEMINI_MODELS."""
    options = []
    for model_id, label in GEMINI_MODELS.items():
        options.append({"name": label, "value": model_id})
    prompt = inquirer.select(
        message="Select Gemini model:",
        choices=options,
        default="gemini-2.0-flash-lite",
    )
    return prompt.execute()
|
||||
|
||||
|
||||
def get_projects() -> list[str]:
    """Return sorted names of project folders that contain a source.mp4."""
    root = settings.projects_dir
    root.mkdir(parents=True, exist_ok=True)
    names = []
    for entry in root.iterdir():
        if entry.is_dir() and (entry / "source.mp4").exists():
            names.append(entry.name)
    names.sort()
    return names
|
||||
|
||||
|
||||
def select_project() -> str | None:
    """Ask the user to choose an existing project; None when there are none."""
    available = get_projects()
    if not available:
        console.print("[red]No projects found in projects/ directory[/]")
        console.print("Create a folder with source.mp4 inside projects/")
        return None

    prompt = inquirer.select(
        message="Select project:",
        choices=available,
    )
    return prompt.execute()
|
||||
|
||||
|
||||
def display_cache_status(statuses: list[StepStatus]) -> None:
    """Render a two-column table showing which pipeline steps are cached."""
    table = Table(title="Cache Status", show_header=False, box=None)
    table.add_column("Step", style="cyan")
    table.add_column("Status")

    for entry in statuses:
        label = "[green]Cached[/]" if entry.cached else "[yellow]Missing[/]"
        table.add_row(f"├─ {entry.name}", label)

    console.print(table)
|
||||
|
||||
|
||||
def select_cache_strategy(statuses: list[StepStatus]) -> int:
    """Ask from which step to rebuild; -1 means reuse every cached artifact."""
    options: list[dict] = [{"name": "Use all cache (Recommended)", "value": -1}]
    options.extend(
        {"name": f"Rebuild from: {entry.name}", "value": idx}
        for idx, entry in enumerate(statuses)
    )

    return inquirer.select(
        message="Select cache strategy:",
        choices=options,
        default=-1,
    ).execute()
|
||||
|
||||
|
||||
def confirm_run() -> bool:
    """Yes/no confirmation before launching the pipeline (defaults to yes)."""
    prompt = inquirer.confirm(message="Start pipeline?", default=True)
    return prompt.execute()
|
||||
65
dubbing/config.py
Normal file
65
dubbing/config.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class TTSConfig:
    """Per-language text-to-speech engine settings.

    Russian audio is produced by the Silero v4_ru model with the "xenia"
    voice at 48 kHz; English audio is produced by Edge TTS with a US
    neural voice.
    """

    # Russian: Silero v4_ru
    RU_ENGINE = "silero"
    RU_SAMPLE_RATE = 48000
    RU_VOICE = "xenia"

    # English: Edge TTS
    EN_ENGINE = "edge-tts"
    EN_VOICE = "en-US-GuyNeural"
|
||||
|
||||
|
||||
class TranslationPrompts:
    """Language-specific translation prompts.

    These strings are sent verbatim to the LLM as system prompts; the
    Russian instructions are runtime data and must not be translated.
    """

    # System prompt for direct Chinese → Russian translation.
    RU = """Ты переводчик субтитров с китайского на русский.
Переводи точно по смыслу, но со слегка необычным выбором слов:
- Используй чуть более формальные или технические синонимы для глаголов, с шансом 1/10
- Например: "получить" → "обрести", "сказать" → "произнести"
- НЕ используй архаизмы
- Не добавляй лишних слов, переводи компактно
- Иногда используй причастные обороты вместо придаточных
- ОБЯЗАТЕЛЬНО используй букву ё везде где нужно (её, ещё, всё, моё, твоё, зелёный и т.д.)
- Глагол to be НЕ переводи буквально: "I am happy" → "я счастлив" (НЕ "я есть счастлив")
ВАЖНО: Все числа пиши словами (например: "42" → "сорок два", "1000" → "одна тысяча")."""

    # System prompt for direct Chinese → English translation.
    EN = """You are a subtitle translator from Chinese to English.
Translate naturally, keep conversational style.
IMPORTANT: Write all numbers as words (e.g., "42" → "forty-two", "1000" → "one thousand")."""

    # Two-stage translation: Chinese -> English -> Russian
    # Stage 1 is identical to the plain Chinese → English prompt.
    EN_RU_STAGE1 = """You are a subtitle translator from Chinese to English.
Translate naturally, keep conversational style.
IMPORTANT: Write all numbers as words (e.g., "42" → "forty-two", "1000" → "one thousand")."""

    # Stage 2: deliberately word-for-word English → Russian pass.
    EN_RU_STAGE2 = """Ты переводчик субтитров с английского на русский.
Переводи МАКСИМАЛЬНО ДОСЛОВНО, слово в слово:
- Сохраняй порядок слов как в английском где возможно
- Переводи идиомы буквально (например: "break a leg" → "сломай ногу")
- НЕ адаптируй под русский язык, просто переводи каждое слово
- НЕ используй архаизмы
- ОБЯЗАТЕЛЬНО используй букву ё везде где нужно (её, ещё, всё, моё, твоё, зелёный и т.д.)
- ИСКЛЮЧЕНИЕ: глагол to be НЕ переводи как "есть/является": "I am happy" → "я счастлив" (НЕ "я есть счастлив")
ВАЖНО: Все числа пиши словами (например: "42" → "сорок два")."""
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application settings, loaded from the environment and a .env file."""

    # Google Gemini API key; empty by default so the app can start without it.
    gemini_api_key: str = ""
    # Root folder that holds one sub-folder per dubbing project.
    projects_dir: Path = Path(__file__).parent.parent / "projects"
    # Gain (dB) applied to the original track when mixing; negative = quieter.
    original_volume_db: int = -20
    tts_sample_rate: int = 48000
    tts_voice: str = "xenia"
    # Hard cap on the atempo speed-up applied to overlong TTS clips.
    max_speedup: float = 3.0
    # Number of segments sent to the LLM per request.
    translation_chunk_size: int = 100
    # Maximum concurrent translation requests.
    translation_concurrency: int = 20

    model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}


# Shared singleton imported across the package.
settings = Settings()
|
||||
68
dubbing/models.py
Normal file
68
dubbing/models.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class Language(str, Enum):
    """Target language / translation route for a dubbing run."""

    RU = "ru"
    EN = "en"
    EN_RU = "en_ru"  # Chinese -> English -> Russian
|
||||
|
||||
|
||||
class Segment(BaseModel):
    """One recognized speech segment with timing."""

    # Start/end offsets within the source audio; treated as milliseconds
    # by the mixing step.
    start: int
    end: int
    # Recognized source-language text.
    text: str
|
||||
|
||||
|
||||
class TranslatedSegment(Segment):
    """A segment plus its translation into the target language."""

    # Translated text (translation steps fall back to the source text
    # when a chunk fails to translate).
    translated: str
|
||||
|
||||
|
||||
class Translation(BaseModel):
    """One item of the structured LLM output."""

    # Chunk-local index of the segment this translation belongs to.
    id: int
    translated: str
|
||||
|
||||
|
||||
class TranslationBatch(BaseModel):
    """Structured output schema the translation agent must return."""

    translations: list[Translation]
|
||||
|
||||
|
||||
class TTSSegment(TranslatedSegment):
    """A translated segment with its synthesized audio clip."""

    audio_path: Path
    # Duration of the synthesized clip, in milliseconds.
    audio_duration_ms: float
|
||||
|
||||
|
||||
class ProjectPaths(BaseModel):
    """All file-system locations used by a single dubbing project."""

    root: Path
    source_video: Path
    source_audio: Path
    segments_json: Path
    translated_json: Path
    tts_dir: Path
    dubbed_audio: Path
    result_video: Path

    model_config = {"arbitrary_types_allowed": True}

    @classmethod
    def from_project(
        cls, project_dir: Path, language: Language = Language.RU
    ) -> "ProjectPaths":
        """Build the canonical path layout for *project_dir* and *language*."""
        suffix = language.value
        locations = {
            "root": project_dir,
            "source_video": project_dir / "source.mp4",
            "source_audio": project_dir / "source.mp3",
            "segments_json": project_dir / "segments.json",
            "translated_json": project_dir / f"translated_{suffix}.json",
            "tts_dir": project_dir / f"tts_{suffix}",
            "dubbed_audio": project_dir / f"dubbed_{suffix}.mp3",
            "result_video": project_dir / f"result_{suffix}.mkv",
        }
        return cls(**locations)
|
||||
|
||||
|
||||
class StepStatus(BaseModel):
    """Cache-state snapshot for one pipeline step, shown in the CLI."""

    name: str
    cached: bool
    # Currently always equal to `cached` (Pipeline.get_cache_status sets both
    # from the same is_cached() result).
    output_exists: bool
|
||||
71
dubbing/pipeline.py
Normal file
71
dubbing/pipeline.py
Normal file
@@ -0,0 +1,71 @@
|
||||
from rich.console import Console
|
||||
from dubbing.models import ProjectPaths, StepStatus, Language
|
||||
from dubbing.steps import (
|
||||
ExtractAudioStep,
|
||||
ASRStep,
|
||||
TranslateStep,
|
||||
TTSStep,
|
||||
MixStep,
|
||||
FinalizeStep,
|
||||
)
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
class Pipeline:
    """Orchestrates the dubbing steps for one project.

    Steps run in a fixed order; each step caches its output on disk so a
    re-run can skip work that is already done.
    """

    def __init__(
        self,
        project_name: str,
        model_name: str = "gemini-2.0-flash-lite",
        language: Language = Language.RU,
    ):
        # Function-scope import, as in the original — keeps dubbing.config
        # out of module import time (presumably to avoid an import cycle).
        from dubbing.config import settings

        project_dir = settings.projects_dir / project_name
        self.paths = ProjectPaths.from_project(project_dir, language)
        self.model_name = model_name
        self.language = language
        self.steps = self._create_steps()

    def _create_steps(self) -> list:
        """Build the ordered list of pipeline steps."""
        return [
            ExtractAudioStep(self.paths),
            ASRStep(self.paths),
            TranslateStep(self.paths, self.model_name, self.language),
            TTSStep(self.paths, self.language),
            MixStep(self.paths),
            FinalizeStep(self.paths),
        ]

    def get_cache_status(self) -> list[StepStatus]:
        """Report, per step, whether its cached output already exists.

        Fix: is_cached() was called twice per step; it checks the file
        system, so query it once and reuse the result for both fields.
        """
        statuses = []
        for step in self.steps:
            cached = step.is_cached()
            statuses.append(
                StepStatus(name=step.name, cached=cached, output_exists=cached)
            )
        return statuses

    def clean_from_step(self, step_index: int) -> None:
        """Delete cached outputs of *step_index* and every later step."""
        for step in self.steps[step_index:]:
            step.clean()

    async def run(self, rebuild_from: int = -1) -> None:
        """Run all non-cached steps; optionally clean from *rebuild_from* first.

        rebuild_from: index of the first step whose cache should be discarded,
        or -1 to keep every cached artifact.
        """
        if rebuild_from >= 0:
            console.print(
                f"[yellow]Cleaning from step: {self.steps[rebuild_from].name}[/]"
            )
            self.clean_from_step(rebuild_from)

        for i, step in enumerate(self.steps):
            if step.is_cached():
                console.print(f"[dim]Skipping {step.name} (cached)[/]")
                continue

            console.print(
                f"\n[bold cyan]Step {i + 1}/{len(self.steps)}: {step.name}[/]"
            )
            await step.run()

        console.print("\n[bold green]Pipeline completed![/]")
|
||||
15
dubbing/steps/__init__.py
Normal file
15
dubbing/steps/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from dubbing.steps.extract_audio import ExtractAudioStep
|
||||
from dubbing.steps.asr import ASRStep
|
||||
from dubbing.steps.translate import TranslateStep
|
||||
from dubbing.steps.tts import TTSStep
|
||||
from dubbing.steps.mix import MixStep
|
||||
from dubbing.steps.finalize import FinalizeStep
|
||||
|
||||
__all__ = [
|
||||
"ExtractAudioStep",
|
||||
"ASRStep",
|
||||
"TranslateStep",
|
||||
"TTSStep",
|
||||
"MixStep",
|
||||
"FinalizeStep",
|
||||
]
|
||||
78
dubbing/steps/asr.py
Normal file
78
dubbing/steps/asr.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import json
|
||||
from rich.console import Console
|
||||
from dubbing.steps.base import PipelineStep
|
||||
from dubbing.models import Segment
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
class ASRStep(PipelineStep):
    """Speech recognition: turns source audio into timed Chinese segments.

    Runs SenseVoiceSmall via funasr with FSMN-VAD, then groups word-level
    timestamps into sentence-like segments split on Chinese punctuation.
    Output is cached in segments.json.
    """

    name = "ASR"

    def is_cached(self) -> bool:
        return self.paths.segments_json.exists()

    def clean(self) -> None:
        if self.paths.segments_json.exists():
            self.paths.segments_json.unlink()

    def _group_chinese_segments(
        self, words: list[str], timestamps: list[list[int]]
    ) -> list[Segment]:
        """Merge per-word timestamps into segments split on punctuation.

        Each timestamp is a [start, end] pair aligned with *words*. A
        segment is closed whenever a Chinese punctuation mark is seen;
        any trailing words form a final segment.
        """
        segments: list[Segment] = []
        current_text = ""
        current_start = None
        current_end = None
        punctuation = {"。", ",", "!", "?", ";", ":", "…", "、"}

        for word, ts in zip(words, timestamps):
            if current_start is None:
                current_start = ts[0]
            current_text += word
            current_end = ts[1]

            if word in punctuation:
                segments.append(
                    Segment(start=current_start, end=current_end, text=current_text)
                )
                current_text = ""
                current_start = None

        # Flush any words remaining after the last punctuation mark.
        if current_text:
            segments.append(
                Segment(start=current_start, end=current_end, text=current_text)
            )

        return segments

    async def run(self) -> None:
        console.print("[cyan]Running speech recognition...[/]")

        import torch
        from funasr import AutoModel

        # Fix: device was hard-coded to "mps", which fails on Linux/Windows
        # even though the README advertises CPU support. Fall back to CPU
        # when Apple's Metal backend is unavailable.
        device = "mps" if torch.backends.mps.is_available() else "cpu"

        model = AutoModel(
            model="iic/SenseVoiceSmall",
            device=device,
            vad_model="fsmn-vad",
            vad_kwargs={"max_single_segment_time": 30000},
        )

        result = model.generate(
            input=str(self.paths.source_audio),
            language="zh",
            use_itn=True,
            batch_size_s=60,
            merge_length_s=15,
            output_timestamp=True,
        )

        segments = self._group_chinese_segments(
            result[0]["words"], result[0]["timestamp"]
        )

        with open(self.paths.segments_json, "w", encoding="utf-8") as f:
            json.dump(
                [s.model_dump() for s in segments], f, ensure_ascii=False, indent=2
            )

        console.print(f"[green]✓ Found {len(segments)} segments[/]")
|
||||
21
dubbing/steps/base.py
Normal file
21
dubbing/steps/base.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dubbing.models import ProjectPaths
|
||||
|
||||
|
||||
class PipelineStep(ABC):
    """Abstract base for one stage of the dubbing pipeline.

    Concrete steps implement cache inspection (is_cached), cache removal
    (clean) and the actual work (run).
    """

    # Human-readable step name shown in the CLI; overridden by subclasses.
    name: str = "Step"

    def __init__(self, paths: ProjectPaths):
        # Shared per-project path layout used by every step.
        self.paths = paths

    @abstractmethod
    def is_cached(self) -> bool:
        """Return True when this step's output already exists on disk."""
        pass

    @abstractmethod
    def clean(self) -> None:
        """Delete this step's cached output, if any."""
        pass

    @abstractmethod
    async def run(self) -> None:
        """Execute the step, producing its output files."""
        pass
|
||||
38
dubbing/steps/extract_audio.py
Normal file
38
dubbing/steps/extract_audio.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import subprocess
|
||||
from rich.console import Console
|
||||
from dubbing.steps.base import PipelineStep
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
class ExtractAudioStep(PipelineStep):
    """Pulls the audio track out of the source video into an MP3 via ffmpeg."""

    name = "Extract audio"

    def is_cached(self) -> bool:
        return self.paths.source_audio.exists()

    def clean(self) -> None:
        if self.paths.source_audio.exists():
            self.paths.source_audio.unlink()

    async def run(self) -> None:
        console.print("[cyan]Extracting audio from video...[/]")

        command = [
            "ffmpeg",
            "-y",  # overwrite output without asking
            "-i",
            str(self.paths.source_video),
            "-vn",  # drop the video stream
            "-acodec",
            "libmp3lame",
            "-q:a",
            "2",  # LAME VBR quality preset
            str(self.paths.source_audio),
        ]
        # check=True raises on ffmpeg failure; stderr is captured into the error.
        subprocess.run(command, capture_output=True, check=True)

        console.print("[green]✓ Audio extracted[/]")
|
||||
44
dubbing/steps/finalize.py
Normal file
44
dubbing/steps/finalize.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import subprocess
|
||||
from rich.console import Console
|
||||
from dubbing.steps.base import PipelineStep
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
class FinalizeStep(PipelineStep):
    """Muxes the dubbed audio with the original video into the result file."""

    name = "Finalize"

    def is_cached(self) -> bool:
        return self.paths.result_video.exists()

    def clean(self) -> None:
        if self.paths.result_video.exists():
            self.paths.result_video.unlink()

    async def run(self) -> None:
        console.print("[cyan]Creating final video...[/]")

        command = [
            "ffmpeg",
            "-y",
            "-i",
            str(self.paths.source_video),
            "-i",
            str(self.paths.dubbed_audio),
            "-map",
            "0:v",  # video stream from the source file
            "-map",
            "1:a",  # audio stream from the dubbed track
            "-c:v",
            "copy",  # no re-encoding of either stream
            "-c:a",
            "copy",
            "-shortest",
            str(self.paths.result_video),
        ]
        subprocess.run(command, capture_output=True, check=True)

        console.print(f"[green]✓ Created {self.paths.result_video}[/]")
|
||||
156
dubbing/steps/mix.py
Normal file
156
dubbing/steps/mix.py
Normal file
@@ -0,0 +1,156 @@
|
||||
import json
|
||||
import subprocess
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from pydub import AudioSegment
|
||||
from rich.console import Console
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
BarColumn,
|
||||
TaskProgressColumn,
|
||||
)
|
||||
from dubbing.steps.base import PipelineStep
|
||||
from dubbing.models import TranslatedSegment
|
||||
from dubbing.config import settings
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
class MixStep(PipelineStep):
    """Mixes synthesized TTS clips over the attenuated original audio.

    Each TTS clip is placed at its segment's start time; clips that run
    longer than their slot are sped up with ffmpeg's atempo filter first.
    The result is exported as the project's dubbed MP3.
    """

    name = "Mix"

    def is_cached(self) -> bool:
        return self.paths.dubbed_audio.exists()

    def clean(self) -> None:
        if self.paths.dubbed_audio.exists():
            self.paths.dubbed_audio.unlink()
        # Also drop the cached sped-up variants so they get regenerated.
        for f in self.paths.tts_dir.glob("*_fast.wav"):
            f.unlink()

    def _load_translated(self) -> list[TranslatedSegment]:
        """Read the translated segments produced by the translate step."""
        with open(self.paths.translated_json, "r", encoding="utf-8") as f:
            data = json.load(f)
        return [TranslatedSegment(**s) for s in data]

    def _speedup_file(self, input_path: Path, output_path: Path, speed: float) -> None:
        """Speed up *input_path* by *speed*x into *output_path* via ffmpeg.

        A single atempo stage only accepts factors up to 2.0, so larger
        factors are expressed as a chain of atempo=2.0 stages plus one
        remainder stage.
        """
        filters = []
        remaining = speed
        while remaining > 1.0:
            if remaining > 2.0:
                filters.append("atempo=2.0")
                remaining /= 2.0
            else:
                filters.append(f"atempo={remaining:.3f}")
                remaining = 1.0

        # Fix: check=True was missing, so an ffmpeg failure went unnoticed
        # here and surfaced later as a confusing read error on the missing
        # output file. Every other ffmpeg call in this project checks.
        subprocess.run(
            [
                "ffmpeg",
                "-y",
                "-i",
                str(input_path),
                "-filter:a",
                ",".join(filters),
                "-vn",
                str(output_path),
            ],
            capture_output=True,
            check=True,
        )

    def _process_segment(
        self, i: int, seg: TranslatedSegment, target_sr: int, channels: int
    ) -> tuple[int, np.ndarray | None]:
        """Load (and, if needed, time-compress) the TTS clip for segment *i*.

        Returns (start_ms, samples) where samples is None when no clip was
        synthesized for this segment. Samples are adapted to the original
        track's channel count and sample rate.
        """
        audio_path = self.paths.tts_dir / f"seg_{i:04d}.wav"
        fast_path = self.paths.tts_dir / f"seg_{i:04d}_fast.wav"

        if not audio_path.exists():
            return (int(seg.start), None)

        if fast_path.exists():
            # Reuse the previously sped-up clip.
            data, sr = sf.read(str(fast_path), dtype="int16")
        else:
            data, sr = sf.read(str(audio_path), dtype="int16")
            duration_ms = len(data) / sr * 1000
            available_ms = seg.end - seg.start

            # Compress clips that overflow their slot, never beyond
            # max_speedup, and only for slots longer than 100 ms.
            if duration_ms > available_ms > 100:
                speedup_ratio = min(duration_ms / available_ms, settings.max_speedup)
                self._speedup_file(audio_path, fast_path, speedup_ratio)
                data, sr = sf.read(str(fast_path), dtype="int16")

        # Match the channel layout of the original track.
        if len(data.shape) == 1 and channels == 2:
            data = np.column_stack([data, data])
        elif len(data.shape) == 2 and channels == 1:
            data = data[:, 0]

        # Cheap nearest-neighbour resample to the original sample rate.
        if sr != target_sr:
            ratio = target_sr / sr
            new_len = int(len(data) * ratio)
            indices = np.linspace(0, len(data) - 1, new_len).astype(int)
            data = data[indices]

        return (int(seg.start), data)

    async def run(self) -> None:
        console.print("[cyan]Mixing audio tracks...[/]")

        segments = self._load_translated()

        # Attenuate the original track so the dub sits on top of it.
        original = AudioSegment.from_mp3(str(self.paths.source_audio))
        original_quiet = original + settings.original_volume_db
        original_samples = np.array(
            original_quiet.get_array_of_samples(), dtype=np.float32
        )
        sample_rate = original_quiet.frame_rate
        channels = original_quiet.channels

        if channels == 2:
            original_samples = original_samples.reshape(-1, 2)

        dubbed_samples = np.zeros_like(original_samples, dtype=np.float32)

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Processing & mixing...", total=len(segments))

            with ThreadPoolExecutor(max_workers=8) as executor:
                futures = {
                    executor.submit(
                        self._process_segment, i, seg, sample_rate, channels
                    ): i
                    for i, seg in enumerate(segments)
                }
                for future in as_completed(futures):
                    position_ms, data = future.result()
                    if data is not None:
                        # Overlay the clip at its start position; truncate the
                        # tail if it would run past the end of the track.
                        start_sample = int(position_ms * sample_rate / 1000)
                        end_sample = min(start_sample + len(data), len(dubbed_samples))
                        length = end_sample - start_sample
                        dubbed_samples[start_sample:end_sample] += data[:length].astype(
                            np.float32
                        )
                    progress.advance(task)

        console.print("[cyan]Exporting MP3...[/]")
        final_samples = original_samples + dubbed_samples
        # Clamp to the int16 range before converting back.
        final_samples = np.clip(final_samples, -32768, 32767).astype(np.int16)

        final = AudioSegment(
            final_samples.tobytes(),
            frame_rate=sample_rate,
            sample_width=2,
            channels=channels,
        )
        final.export(str(self.paths.dubbed_audio), format="mp3")

        console.print("[green]✓ Audio mixed[/]")
|
||||
215
dubbing/steps/translate.py
Normal file
215
dubbing/steps/translate.py
Normal file
@@ -0,0 +1,215 @@
|
||||
import asyncio
|
||||
import json
|
||||
from rich.console import Console
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
BarColumn,
|
||||
TaskProgressColumn,
|
||||
)
|
||||
from pydantic_ai import Agent, NativeOutput
|
||||
from pydantic_ai.models.google import GoogleModel
|
||||
from pydantic_ai.providers.google import GoogleProvider
|
||||
from dubbing.steps.base import PipelineStep
|
||||
from dubbing.models import Segment, TranslatedSegment, TranslationBatch, Language
|
||||
from dubbing.config import settings, TranslationPrompts
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
class TranslateStep(PipelineStep):
    """Translates ASR segments with Gemini, in parallel chunked batches.

    Supports direct Zh→Ru, Zh→En, and a two-stage Zh→En→Ru mode where the
    English output of stage 1 becomes the input of stage 2. Results are
    cached in translated_<lang>.json.
    """

    name = "Translate"

    def __init__(
        self,
        paths,
        model_name: str = "gemini-2.0-flash-lite",
        language: Language = Language.RU,
    ):
        super().__init__(paths)
        self.model_name = model_name
        self.language = language

    def is_cached(self) -> bool:
        return self.paths.translated_json.exists()

    def clean(self) -> None:
        if self.paths.translated_json.exists():
            self.paths.translated_json.unlink()

    def _load_segments(self) -> list[Segment]:
        """Read the ASR output produced by the previous step."""
        with open(self.paths.segments_json, "r", encoding="utf-8") as f:
            data = json.load(f)
        return [Segment(**s) for s in data]

    def _get_system_prompt(self, stage: int = 1) -> str:
        """Pick the system prompt for the target language and stage."""
        if self.language == Language.EN:
            return TranslationPrompts.EN
        elif self.language == Language.EN_RU:
            return (
                TranslationPrompts.EN_RU_STAGE1
                if stage == 1
                else TranslationPrompts.EN_RU_STAGE2
            )
        return TranslationPrompts.RU

    def _get_translate_command(self, stage: int = 1) -> str:
        """Imperative header placed before the numbered source lines."""
        if self.language == Language.EN:
            return "Translate:"
        elif self.language == Language.EN_RU:
            return "Translate:" if stage == 1 else "Переведи:"
        return "Переведи:"

    def _get_context_header(self, stage: int = 1) -> str:
        """Header for the rolling context block of previous translations."""
        if self.language == Language.EN:
            return "Context:"
        elif self.language == Language.EN_RU:
            return "Context:" if stage == 1 else "Контекст:"
        return "Контекст:"

    def _create_agent(self, stage: int = 1) -> Agent:
        """Build a pydantic-ai agent that returns a structured TranslationBatch."""
        provider = GoogleProvider(api_key=settings.gemini_api_key)
        model = GoogleModel(self.model_name, provider=provider)

        return Agent(
            model,
            output_type=NativeOutput(TranslationBatch),
            system_prompt=self._get_system_prompt(stage),
        )

    async def _translate_chunk(
        self,
        agent: Agent,
        chunk: list[Segment],
        chunk_idx: int,
        context: str,
        semaphore: asyncio.Semaphore,
        stage: int = 1,
    ) -> list[TranslatedSegment]:
        """Translate one chunk; on any failure, fall back to the source text.

        Fix: the original exception handler referenced the loop variable
        `seg`, which is unbound when agent.run() itself raises (raising a
        NameError that masks the real error), and otherwise applied ONE
        segment's text as the fallback for the WHOLE chunk. The fallback is
        now computed per segment.
        """
        async with semaphore:
            # For stage 2, the translated field of stage 1 is the source.
            if stage == 2:
                items = "\n".join([f"{i}: {s.translated}" for i, s in enumerate(chunk)])
            else:
                items = "\n".join([f"{i}: {s.text}" for i, s in enumerate(chunk)])
            prompt = f"{context}{self._get_translate_command(stage)}\n\n{items}"

            try:
                result = await agent.run(prompt)
                # Index structured output by chunk-local id; setdefault keeps
                # the first occurrence (matching the original first-match scan)
                # and replaces the original O(n²) lookup.
                by_id: dict[int, str] = {}
                for t in result.output.translations:
                    by_id.setdefault(t.id, t.translated)
                return [
                    TranslatedSegment(
                        start=seg.start,
                        end=seg.end,
                        text=seg.text,
                        translated=by_id.get(i)
                        or (seg.translated if stage == 2 else seg.text),
                    )
                    for i, seg in enumerate(chunk)
                ]
            except Exception as e:
                console.print(f"[red]Chunk {chunk_idx} error: {e}[/]")
                return [
                    TranslatedSegment(
                        start=s.start,
                        end=s.end,
                        text=s.text,
                        translated=s.translated if stage == 2 else s.text,
                    )
                    for s in chunk
                ]

    async def _translate_parallel(
        self, agent: Agent, segments: list, stage: int = 1, desc: str = "Translating..."
    ) -> list[TranslatedSegment]:
        """Translate *segments* in chunks, `concurrency` chunks at a time.

        A rolling context of the last few finished translations is passed
        to each batch to keep terminology consistent across chunks.
        """
        chunk_size = settings.translation_chunk_size
        concurrency = settings.translation_concurrency

        chunks = [
            segments[i : i + chunk_size] for i in range(0, len(segments), chunk_size)
        ]
        semaphore = asyncio.Semaphore(concurrency)

        all_results: list[TranslatedSegment] = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task = progress.add_task(desc, total=len(chunks))

            for batch_start in range(0, len(chunks), concurrency):
                batch = chunks[batch_start : batch_start + concurrency]

                # Seed this batch with up to 5 previous translations.
                context = ""
                if all_results:
                    prev = all_results[-5:]
                    if stage == 2:
                        ctx_lines = [f"• {s.translated}" for s in prev]
                    else:
                        ctx_lines = [f"• {s.text} → {s.translated}" for s in prev]
                    context = (
                        f"{self._get_context_header(stage)}\n"
                        + "\n".join(ctx_lines)
                        + "\n\n"
                    )

                tasks = [
                    self._translate_chunk(
                        agent, chunk, batch_start + idx, context, semaphore, stage
                    )
                    for idx, chunk in enumerate(batch)
                ]
                results = await asyncio.gather(*tasks, return_exceptions=True)

                # Exceptions are skipped (their segments are simply dropped);
                # chunk-level failures already fell back inside _translate_chunk.
                for result in results:
                    if isinstance(result, list):
                        all_results.extend(result)
                    progress.advance(task)

        return all_results

    async def run(self) -> None:
        """Translate all ASR segments and write the translated JSON cache."""
        segments = self._load_segments()

        if self.language == Language.EN_RU:
            # Stage 1: Chinese -> English
            console.print(f"[cyan]Stage 1: Chinese → English ({self.model_name})...[/]")
            agent1 = self._create_agent(stage=1)
            intermediate = await self._translate_parallel(
                agent1, segments, stage=1, desc="Zh→En..."
            )

            # Stage 2: English -> Russian
            console.print(f"[cyan]Stage 2: English → Russian ({self.model_name})...[/]")
            agent2 = self._create_agent(stage=2)
            translated = await self._translate_parallel(
                agent2, intermediate, stage=2, desc="En→Ru..."
            )
        else:
            console.print(
                f"[cyan]Translating to {self.language.value.upper()} with {self.model_name}...[/]"
            )
            agent = self._create_agent()
            translated = await self._translate_parallel(agent, segments)

        with open(self.paths.translated_json, "w", encoding="utf-8") as f:
            json.dump(
                [s.model_dump() for s in translated], f, ensure_ascii=False, indent=2
            )

        # Heuristic: a translation identical to its source likely fell back.
        missing = sum(1 for s in translated if s.translated == s.text)
        if missing:
            console.print(
                f"[yellow]Warning: {missing} segments may not be translated[/]"
            )

        console.print(f"[green]✓ Translated {len(translated)} segments[/]")
|
||||
162
dubbing/steps/tts.py
Normal file
162
dubbing/steps/tts.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import json
|
||||
import re
|
||||
import torch
|
||||
import torchaudio
|
||||
import edge_tts
|
||||
from pathlib import Path
|
||||
from rich.console import Console
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
BarColumn,
|
||||
TaskProgressColumn,
|
||||
)
|
||||
from dubbing.steps.base import PipelineStep
|
||||
from dubbing.models import TranslatedSegment, Language
|
||||
from dubbing.config import TTSConfig
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
class TTSStep(PipelineStep):
    """Synthesize one WAV file per translated segment.

    Russian output (RU and EN_RU modes) is generated locally with Silero;
    English output uses Microsoft Edge TTS. Files are written to
    ``paths.tts_dir`` as ``seg_NNNN.wav``, indexed by segment position so
    later steps can align audio with segments.
    """

    name = "TTS"

    def __init__(self, paths, language: Language = Language.RU):
        super().__init__(paths)
        self.language = language
        # Silero model is heavy; load lazily, only if Russian TTS is used.
        self._silero_model = None

    def is_cached(self) -> bool:
        """Step counts as done if at least one WAV was produced."""
        if not self.paths.tts_dir.exists():
            return False
        return any(self.paths.tts_dir.glob("*.wav"))

    def clean(self) -> None:
        """Remove previously generated WAV files."""
        if self.paths.tts_dir.exists():
            for f in self.paths.tts_dir.glob("*.wav"):
                f.unlink()

    def _load_translated(self) -> list[TranslatedSegment]:
        """Load the translation step's JSON output from disk."""
        with open(self.paths.translated_json, "r", encoding="utf-8") as f:
            data = json.load(f)
        return [TranslatedSegment(**s) for s in data]

    def _clean_text_russian(self, text: str) -> str | None:
        """Clean text for Russian TTS (Silero).

        Returns None when nothing speakable remains (no Cyrillic letters).
        """
        # FIX: explicit character list instead of \w — Python's \w is
        # Unicode-aware, so the old pattern let CJK (and any other script's)
        # characters leak through the filter into the TTS input.
        text = re.sub(r"[^0-9_\s.,!?;:\-—–\'\"«»а-яА-ЯёЁ]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        if not text or not re.search(r"[а-яА-ЯёЁ]", text):
            return None
        return text

    def _clean_text_english(self, text: str) -> str | None:
        """Clean text for English TTS (Edge TTS).

        Returns None when nothing speakable remains (no Latin letters).
        """
        # FIX: explicit character list instead of \w (see _clean_text_russian);
        # \w made the a-zA-Z0-9 ranges redundant and admitted CJK characters.
        text = re.sub(r"[^0-9_\s.,!?;:\-—–\'\"a-zA-Z]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        if not text or not re.search(r"[a-zA-Z]", text):
            return None
        return text

    def _clean_text(self, text: str) -> str | None:
        """Dispatch text cleaning based on target language."""
        if self.language == Language.EN:
            return self._clean_text_english(text)
        # RU and EN_RU both output Russian
        return self._clean_text_russian(text)

    def _load_silero_model(self):
        """Load (and memoize) the Silero TTS model for Russian."""
        if self._silero_model is None:
            console.print("[dim]Loading Silero TTS model...[/]")
            # Pinned to CPU; the model is loaded via torch.hub on first use.
            device = torch.device("cpu")
            self._silero_model, _ = torch.hub.load(
                repo_or_dir="snakers4/silero-models",
                model="silero_tts",
                language="ru",
                speaker="v4_ru",
            )
            self._silero_model.to(device)
        return self._silero_model

    def _synthesize_russian(self, text: str, output_path: Path) -> bool:
        """Synthesize Russian speech with Silero. Returns False on failure."""
        model = self._load_silero_model()
        try:
            audio = model.apply_tts(
                text=text,
                speaker=TTSConfig.RU_VOICE,
                sample_rate=TTSConfig.RU_SAMPLE_RATE,
            )
            # apply_tts yields a 1-D tensor; torchaudio expects (channels, time).
            torchaudio.save(
                str(output_path), audio.unsqueeze(0), TTSConfig.RU_SAMPLE_RATE
            )
            return True
        except ValueError:
            # Silero raises ValueError on text it cannot voice; caller records
            # the segment as skipped.
            return False

    async def _synthesize_english(self, text: str, output_path: Path) -> bool:
        """Synthesize English speech with Edge TTS. Returns False on failure."""
        mp3_path = output_path.with_suffix(".mp3")
        try:
            communicate = edge_tts.Communicate(text, TTSConfig.EN_VOICE)
            await communicate.save(str(mp3_path))

            # Convert MP3 to WAV using pydub
            from pydub import AudioSegment

            audio = AudioSegment.from_mp3(str(mp3_path))
            audio.export(str(output_path), format="wav")
            return True
        except Exception as e:
            console.print(f"[red]Edge TTS error: {e}[/]")
            return False
        finally:
            # FIX: always remove the intermediate MP3 — previously it was
            # unlinked only on the success path and leaked on conversion errors.
            mp3_path.unlink(missing_ok=True)

    async def _synthesize(self, text: str, output_path: Path) -> bool:
        """Dispatch synthesis based on target language."""
        if self.language == Language.EN:
            return await self._synthesize_english(text, output_path)
        # RU and EN_RU both use Russian TTS
        return self._synthesize_russian(text, output_path)

    async def run(self) -> None:
        """Generate one WAV per segment and report skipped/failed segments."""
        engine = (
            TTSConfig.EN_ENGINE if self.language == Language.EN else TTSConfig.RU_ENGINE
        )  # EN_RU uses RU engine
        console.print(f"[cyan]Generating TTS audio ({engine})...[/]")

        self.paths.tts_dir.mkdir(parents=True, exist_ok=True)

        segments = self._load_translated()

        # (index, original translated text, reason) for the summary below.
        skipped = []
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Generating speech...", total=len(segments))

            for i, seg in enumerate(segments):
                clean_text = self._clean_text(seg.translated)
                if not clean_text:
                    skipped.append((i, seg.translated, "no_text"))
                    progress.advance(task)
                    continue

                path = self.paths.tts_dir / f"seg_{i:04d}.wav"
                success = await self._synthesize(clean_text, path)
                if not success:
                    skipped.append((i, seg.translated, "tts_error"))

                progress.advance(task)

        generated = len(segments) - len(skipped)
        console.print(f"[green]✓ Generated {generated} audio files[/]")
        if skipped:
            console.print(f"[yellow]Skipped {len(skipped)} segments:[/]")
            for idx, text, reason in skipped:
                console.print(f"  [dim]{idx}:[/] {text[:60]}... [red]({reason})[/]")
|
||||
61
main.py
Normal file
61
main.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import asyncio
|
||||
from rich.console import Console
|
||||
from dubbing.cli import (
|
||||
select_language,
|
||||
select_model,
|
||||
select_project,
|
||||
display_cache_status,
|
||||
select_cache_strategy,
|
||||
confirm_run,
|
||||
)
|
||||
from dubbing.pipeline import Pipeline
|
||||
|
||||
console = Console()  # module-level Rich console for CLI output

# Position of the translation step in the Pipeline's step list; used below to
# decide whether the user must pick an LLM model for this run.
TRANSLATE_STEP_INDEX = 2
|
||||
|
||||
|
||||
def main():
    """Interactive CLI entry point.

    Walks the user through project, language, cache-strategy and (when the
    translation step will actually run) model selection, then executes the
    pipeline from the chosen rebuild point.
    """
    console.print("[bold]Fucking Chinese Dramas Dubbing Pipeline[/]\n")

    # 1. Select project first
    project = select_project()
    if not project:
        return
    console.print(f"[dim]Selected project: {project}[/]\n")

    # 2. Select language
    language = select_language()
    console.print(f"[dim]Selected language: {language.value.upper()}[/]\n")

    # 3. Create pipeline with dummy model to check cache; the model is only
    #    used by the translation step, which may be fully cached.
    pipeline = Pipeline(project, "gemini-2.0-flash-lite", language)

    statuses = pipeline.get_cache_status()
    display_cache_status(statuses)
    console.print()

    # 4. Select cache strategy (-1 means "keep all caches")
    rebuild_from = select_cache_strategy(statuses)

    # 5. Only ask for model if translation needs to run: either the rebuild
    #    point is at/before the translation step, or nothing is rebuilt but
    #    the translation cache is missing.
    translate_cached = statuses[TRANSLATE_STEP_INDEX].cached
    needs_translation = (0 <= rebuild_from <= TRANSLATE_STEP_INDEX) or (
        rebuild_from == -1 and not translate_cached
    )

    if needs_translation:
        model = select_model()
        console.print(f"[dim]Selected model: {model}[/]\n")
        # Recreate the pipeline with the model the user actually chose.
        pipeline = Pipeline(project, model, language)

    if not confirm_run():
        console.print("[yellow]Cancelled[/]")
        return

    console.print()
    asyncio.run(pipeline.run(rebuild_from))


if __name__ == "__main__":
    main()
|
||||
20
pyproject.toml
Normal file
20
pyproject.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[project]
|
||||
name = "chinese-dramas"
|
||||
version = "0.1.0"
|
||||
description = "Auto-dub Chinese videos to Russian/English using AI"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"edge-tts>=7.2.7",
|
||||
"funasr>=1.3.1",
|
||||
"inquirerpy>=0.3.4",
|
||||
"modelscope>=1.34.0",
|
||||
"pydantic-ai-slim[google]>=1.51.0",
|
||||
"pydantic-settings>=2.0",
|
||||
"pydub>=0.25.1",
|
||||
"rich>=14.3.1",
|
||||
"soundfile>=0.13.0",
|
||||
"torch>=2.10.0",
|
||||
"torchaudio>=2.10.0",
|
||||
"torchcodec>=0.10.0",
|
||||
]
|
||||
Reference in New Issue
Block a user