"""geo_mouth.py - make a flat, geometric character talk by DRAWING the mouth.

For a Peppa-style character (just shapes and lines) you don't need any AI
at all. The mouth is a shape you can draw yourself - so draw it, and tie
how far it opens to how LOUD the voice is at each moment: loud -> wide
open, quiet -> closed. That is real, automatic lip-flap with nothing but
Pillow, NumPy + ffmpeg.

Run:
    python3 geo_mouth.py character.png voice.mp3 talk.mp4

character.png should have a clear, empty spot where the mouth goes (draw
your character with NO mouth). Set the mouth position, size and colors in
the knobs below to fit your character, then run again until it sits right.

Needs:  pip3 install pillow numpy   (and the free ffmpeg tool)
"""
import os
import subprocess
import sys
import tempfile
import wave

import numpy as np
from PIL import Image, ImageDraw

# ---- knobs: tune these to place and style the mouth -------------------------
# Position of the mouth centre, as fractions of the picture width / height.
MOUTH_CX = 0.50
MOUTH_CY = 0.62
# Mouth width as a fraction of the picture width.
MOUTH_W = 0.17
# Mouth height as a fraction of the picture height:
# fully closed (a thin line) and fully open.
MIN_OPEN = 0.010
MAX_OPEN = 0.130
# RGBA colours: lip outline, dark mouth interior, and the tongue that is
# drawn once the mouth is wide enough.
LIP = (95, 30, 40, 255)
INNER = (140, 40, 55, 255)
TONGUE = (228, 120, 140, 255)
# Output video frame rate.
FPS = 25
# Envelope smoothing factor: 0 is snappy/jittery, around 0.6 is smooth/laggy.
SMOOTH = 0.45
# -----------------------------------------------------------------------------


def _envelope_from_samples(samples, sr, fps):
    """Turn mono float samples into one smoothed 0..1 loudness value per frame."""
    n = len(samples)
    if n == 0:
        # Nothing decoded: a single closed-mouth frame instead of NaNs.
        return np.zeros(1)

    # Same frame count as before (the partial tail window is dropped), but the
    # window edges follow the exact frame clock, so rates where sr is not a
    # multiple of fps (24, 30, ...) do not slowly drift against the video.
    frames = max(1, int(n * fps // sr))
    edges = np.round(np.arange(frames + 1) * (sr / fps)).astype(int)
    # Guarantee at least one sample per window; this only matters for clips
    # shorter than one frame or an fps above the sample rate.
    starts = np.minimum(edges[:-1], n - 1)
    stops = np.maximum(edges[1:], starts + 1)
    env = np.array([np.sqrt(np.mean(samples[lo:hi] ** 2))
                    for lo, hi in zip(starts, stops)])

    # Normalising a (near-)silent track would blow noise up to full scale and
    # hold the mouth wide open, so treat it as "closed the whole time".
    silence_floor = 1e-4
    peak = env.max()
    if peak < silence_floor:
        return np.zeros(frames)
    env = env / peak                    # normalise to 0..1
    env = env ** 0.6                    # perceptual: the mouth opens a bit sooner

    # one-pole smoothing so the jaw doesn't chatter on every tiny bump
    out, prev = np.zeros_like(env), 0.0
    for i, v in enumerate(env):
        prev = SMOOTH * prev + (1 - SMOOTH) * v
        out[i] = prev
    return out


def audio_envelope(path, fps):
    """Loudness per video-frame: decode to mono 16 kHz, take RMS per window.

    Args:
        path: audio file that ffmpeg can read.
        fps: video frame rate; one envelope value is produced per frame.

    Returns:
        1-D numpy array with values in 0..1 (at least one element). The
        values are RMS per frame window, normalised to the loudest window,
        raised to the power 0.6 and smoothed with the module-level SMOOTH
        factor. A silent or empty track yields all zeros.

    Raises:
        ValueError: if fps is not positive.
        subprocess.CalledProcessError: if ffmpeg fails to decode the file.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps!r}")

    # Decode into a private temp dir so nothing is left behind in (or
    # overwritten in) the current directory, and parallel runs cannot collide.
    with tempfile.TemporaryDirectory(prefix="geo_mouth_") as tmp:
        wav_path = os.path.join(tmp, "voice.wav")
        subprocess.run(["ffmpeg", "-y", "-v", "error", "-i", path,
                        "-ac", "1", "-ar", "16000", wav_path], check=True)
        with wave.open(wav_path, "rb") as w:
            sr = w.getframerate()
            pcm = np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16)

    # int16 PCM -> float in [-1, 1)
    return _envelope_from_samples(pcm.astype(np.float32) / 32768.0, sr, fps)


def _draw_mouth(d, loud, cx, cy, mw, H, lw):
    """Draw one mouth (plus a tongue when wide open) for loudness `loud` in 0..1."""
    open_h = int((MIN_OPEN + (MAX_OPEN - MIN_OPEN) * loud) * H)
    box = [cx - mw // 2, cy - open_h // 2, cx + mw // 2, cy + open_h // 2]
    d.ellipse(box, fill=INNER, outline=LIP, width=lw)          # the mouth
    if open_h > 0.045 * H:                                     # a tongue when wide
        tw, th = mw * 0.6, open_h * 0.34
        d.ellipse([cx - tw / 2, box[3] - th - lw,
                   cx + tw / 2, box[3] - lw * 0.5], fill=TONGUE)


def main():
    """Command-line entry point: character image + voice file -> talking video.

    Reads three positional arguments from sys.argv (character image, voice
    audio, output video), renders one frame per envelope value with the mouth
    opened in proportion to loudness, and muxes frames and audio with ffmpeg.

    Raises:
        SystemExit: when fewer than three arguments are given.
        subprocess.CalledProcessError: if an ffmpeg invocation fails.
    """
    if len(sys.argv) < 4:
        sys.exit("usage: python3 geo_mouth.py character.png voice.mp3 talk.mp4")
    char, voice, out = sys.argv[1:4]

    # Close the image file as soon as the pixels are loaded.
    with Image.open(char) as img:
        base = img.convert("RGBA")
    # libx264 with yuv420p rejects odd frame sizes, so trim at most one pixel
    # from the right/bottom edge. Even-sized images are left untouched.
    W, H = base.width - base.width % 2, base.height - base.height % 2
    if (W, H) != base.size:
        base = base.crop((0, 0, W, H))
    env = audio_envelope(voice, FPS)

    cx, cy, mw = int(MOUTH_CX * W), int(MOUTH_CY * H), int(MOUTH_W * W)
    lw = max(3, W // 220)          # outline thickness scales with the picture

    # Render into a fresh temp dir: frames left over from an earlier (longer)
    # run can never be picked up by the %05d pattern, and the PNGs are removed
    # once the video has been written.
    with tempfile.TemporaryDirectory(prefix="geo_frames_") as frames_dir:
        for i, loud in enumerate(env):
            fr = base.copy()
            _draw_mouth(ImageDraw.Draw(fr), loud, cx, cy, mw, H, lw)
            # ffmpeg's image sequence numbering starts at 1 here
            fr.convert("RGB").save(os.path.join(frames_dir, f"{i + 1:05d}.png"))

        subprocess.run(["ffmpeg", "-y", "-v", "error", "-framerate", str(FPS),
                        "-i", os.path.join(frames_dir, "%05d.png"), "-i", voice,
                        "-map", "0:v", "-map", "1:a",
                        "-c:v", "libx264", "-pix_fmt", "yuv420p", "-movflags", "+faststart",
                        "-c:a", "aac", "-shortest", out], check=True)
    print("done ->", out)


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
