"""mouth_transplant.py - graft an on-style talking mouth onto your scene.

The idea: Wav2Lip's mouth is soft, and GFPGAN's mouth drifts toward a
realistic style. Instead, generate a crisp talking clip of the SAME
character with an image-to-video model (keep the HEAD STILL!), then copy
just the mouth rectangle from that clip onto your scene, frame by frame,
with a soft feathered edge so there's no seam. Lay your real voice on top
afterwards (the page shows the one ffmpeg line for that).

Run:
    python3 mouth_transplant.py talking.mp4 scene.mp4 out.mp4

Both videos should show the character at the SAME size and position, with
the head locked in place. Tweak the MOUTH BOX below to fit your character
(measure it once in any image viewer), then run again until it lines up.

Needs:  pip3 install opencv-python numpy   (and the free ffmpeg tool)
"""
import os
import subprocess
import sys

import cv2
import numpy as np

# ---- knobs: mouth box position/size as fractions of the frame (0..1) --------
# (MOUTH_X, MOUTH_Y) is the top-left corner of the mouth box and
# (MOUTH_W, MOUTH_H) is its width and height. The defaults suit a
# head-and-shoulders shot with the face centered.
MOUTH_X = 0.38
MOUTH_Y = 0.60
MOUTH_W = 0.24
MOUTH_H = 0.22
# Share of each half of the box (edge -> centre) that is spent fading in.
# 0 = hard edge, ~0.4 = soft blend. Higher hides the seam.
FEATHER = 0.35
# Frame rate used when the blended frames are stitched back into a video.
FPS = 25
# -----------------------------------------------------------------------------


def frames_of(path, outdir):
    """Dump every frame of the video *path* into *outdir* as numbered PNGs.

    ffmpeg writes the frames as ``00001.png``, ``00002.png``, ... inside
    *outdir* (created if missing). Numbered PNGs already present in *outdir*
    are deleted first, so a re-run with a shorter clip does not pick up
    leftover frames from the previous run.

    Returns the list of ``"{outdir}/{name}"`` frame paths in frame order.
    Raises ``subprocess.CalledProcessError`` if ffmpeg exits with an error.
    """
    def is_frame(name):
        # Only files this function produces: "<digits>.png".
        return name.endswith(".png") and name[:-4].isdigit()

    os.makedirs(outdir, exist_ok=True)
    # Clear frames from an earlier run; ffmpeg only overwrites the indices it
    # writes, so a previously longer clip would leave a stale tail behind.
    for name in os.listdir(outdir):
        if is_frame(name):
            os.remove(os.path.join(outdir, name))

    subprocess.run(["ffmpeg", "-y", "-v", "error", "-i", path,
                    f"{outdir}/%05d.png"], check=True)

    # Sort by length first so that a 6-digit index (past frame 99999) still
    # follows the 5-digit ones instead of sorting lexicographically before.
    names = sorted((n for n in os.listdir(outdir) if is_frame(n)),
                   key=lambda n: (len(n), n))
    return [f"{outdir}/{n}" for n in names]


def feather_mask(w, h, fx, fy):
    """Return an (h, w) float32 blend mask: 1 in the middle, 0 at the border.

    Along each axis the weight rises linearly from 0 at the box edge and is
    capped at 1; *fx* / *fy* set how far in (as a share of half the box) the
    rise lasts horizontally / vertically. Values below 1e-3 are treated as
    1e-3, which gives an effectively hard edge.
    """
    def ramp(count, feather):
        # 0 at either end of the axis, 1 at its centre.
        edge_distance = 1 - np.abs(np.linspace(-1, 1, count))
        return np.clip(edge_distance / max(feather, 1e-3), 0, 1)

    # Outer product of the two 1-D ramps gives the separable 2-D mask.
    return np.outer(ramp(h, fy), ramp(w, fx)).astype(np.float32)


def main():
    """Blend the talking clip's mouth box onto the scene clip, frame by frame.

    Command line: ``talking.mp4 scene.mp4 out.mp4``. Both clips are split
    into PNG frames, the mouth rectangle (MOUTH_X/Y/W/H) of each talking
    frame is feather-blended onto the matching scene frame, and the result is
    encoded to *out* at FPS frames per second, carrying over the scene's
    audio track when it has one.

    Exits with a message on missing arguments, empty clips, or unreadable
    frames; raises ``subprocess.CalledProcessError`` if ffmpeg fails.
    """
    if len(sys.argv) < 4:
        sys.exit("usage: python3 mouth_transplant.py talking.mp4 scene.mp4 out.mp4")
    talking, scene, out = sys.argv[1:4]
    t_frames = frames_of(talking, "_talk_frames")
    s_frames = frames_of(scene, "_scene_frames")
    n = min(len(t_frames), len(s_frames))
    print(f"talking={len(t_frames)} scene={len(s_frames)} -> using {n} frames")
    if n == 0:
        sys.exit("no frames to process - check that both input videos are readable")

    os.makedirs("_out_frames", exist_ok=True)
    # Remove numbered frames left by an earlier run: the encoder below reads
    # every consecutive %05d.png, so a stale tail from a longer previous clip
    # would otherwise be appended to this output.
    for name in os.listdir("_out_frames"):
        if name.endswith(".png") and name[:-4].isdigit():
            os.remove(os.path.join("_out_frames", name))

    mask = None
    mask_size = None
    # zip() stops at the shorter clip, i.e. after exactly n frames.
    for i, (scene_path, talk_path) in enumerate(zip(s_frames, t_frames)):
        scene_img = cv2.imread(scene_path)
        talk_img = cv2.imread(talk_path)
        if scene_img is None or talk_img is None:
            # imread signals failure by returning None rather than raising
            sys.exit(f"could not read frame {i + 1} ({scene_path} / {talk_path})")
        H, W = scene_img.shape[:2]
        # talking clip may be a different size - match it to the scene
        if talk_img.shape[:2] != (H, W):
            talk_img = cv2.resize(talk_img, (W, H))

        # Mouth box in pixels, clamped to the frame so a box that runs off
        # the edge cannot produce an ROI smaller than the mask.
        x0 = min(max(int(MOUTH_X * W), 0), W)
        y0 = min(max(int(MOUTH_Y * H), 0), H)
        x1 = min(max(x0 + int(MOUTH_W * W), x0), W)
        y1 = min(max(y0 + int(MOUTH_H * H), y0), H)
        w, h = x1 - x0, y1 - y0

        if w > 0 and h > 0:
            # The mask only depends on the box size; rebuild it only if that
            # changes instead of once per frame.
            if mask_size != (w, h):
                mask = feather_mask(w, h, FEATHER, FEATHER)[:, :, None]
                mask_size = (w, h)

            # Only the ROI needs float math; the rest of the frame stays uint8.
            roi_scene = scene_img[y0:y1, x0:x1].astype(np.float32)
            roi_talk = talk_img[y0:y1, x0:x1].astype(np.float32)
            blended = roi_talk * mask + roi_scene * (1 - mask)
            # Round (not truncate) so float32 round-off such as 199.99998
            # does not darken a pixel by one level.
            scene_img[y0:y1, x0:x1] = np.clip(np.rint(blended), 0, 255).astype(np.uint8)

        cv2.imwrite(f"_out_frames/{i + 1:05d}.png", scene_img)
        if i % 25 == 0:
            print(f"  {i}/{n}")

    # stitch back, carry the SCENE's audio if it has any (-shortest is safe).
    # NOTE: frames were extracted without forcing a rate, so FPS should match
    # the scene clip's frame rate or the carried-over audio will drift.
    subprocess.run(["ffmpeg", "-y", "-v", "error",
                    "-framerate", str(FPS), "-i", "_out_frames/%05d.png",
                    "-i", scene, "-map", "0:v", "-map", "1:a?",
                    "-c:v", "libx264", "-pix_fmt", "yuv420p", "-movflags", "+faststart",
                    "-c:a", "aac", "-shortest", out], check=True)
    print("done ->", out, "(now lay your real voice over it - see the page)")


# Run the transplant only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
