From 08b6543b792161e4606069726823a43dbce62234 Mon Sep 17 00:00:00 2001
From: wsy182 <2392948297@qq.com>
Date: Mon, 15 Jun 2026 23:13:36 +0800
Subject: [PATCH] =?UTF-8?q?perf(video):=20=E4=BC=98=E5=8C=96=E8=A7=86?=
 =?UTF-8?q?=E9=A2=91=E5=A4=84=E7=90=86=E6=80=A7=E8=83=BD=E7=9B=91=E6=8E=A7?=
 =?UTF-8?q?=E5=92=8C=E9=9F=B3=E9=A2=91=E6=92=AD=E6=94=BE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 添加视频处理性能计时和统计功能
- 实现帧处理时间监控和慢帧警告
- 添加音频文件静音修剪功能
- 优化Windows平台音频播放实现
- 调整默认日志输出频率减少冗余信息
- 修复MediaPipe GPU委托在Windows上的兼容性问题
---
 app/audio/generate.py              | 138 ++++++++++++++++++++++++-----
 app/audio/rep_announcer.py         |  87 ++++++++++++------
 app/core/lifecycle.py              |   3 +
 app/exercises/dead_bug/detector.py |  75 ++++++++++++++++
 app/vision/pose_landmarker.py      |   6 ++
 app/webrtc/video_receiver.py       | 112 ++++++++++++++++++++++-
 config.yaml                        |   7 +-
 configs/models.py                  |   7 +-
 8 files changed, 387 insertions(+), 48 deletions(-)

diff --git a/app/audio/generate.py b/app/audio/generate.py
index e13ed73..1b68e9f 100644
--- a/app/audio/generate.py
+++ b/app/audio/generate.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 import platform
 import shutil
 import subprocess
+import wave
 from pathlib import Path
 
 from loguru import logger
@@ -15,6 +16,9 @@ def generate_rep_audio_files(
     rate: int,
     output_dir: Path,
     overwrite: bool = False,
+    trim_leading_silence: bool = True,
+    trim_silence_threshold: int = 500,
+    trim_silence_padding_ms: int = 20,
 ) -> None:
     """
     确保 0~max_count 的运动次数语音 wav 文件存在。
@@ -40,29 +44,37 @@ def generate_rep_audio_files(
 
     if not missing_counts:
         logger.info("Rep audio files already prepared: {}", output_dir)
-        return
-
-    logger.info(
-        "Preparing rep audio files, system={}, count={}, output_dir={}",
-        system,
-        len(missing_counts),
-        output_dir,
-    )
-
-    if system == "darwin":
-        _generate_with_macos_say(
-            counts=missing_counts,
-            output_dir=output_dir,
-            rate=rate,
-        )
     else:
-        _generate_with_pyttsx3(
-            counts=missing_counts,
-            output_dir=output_dir,
-            rate=rate,
+        logger.info(
+            "Preparing rep audio files, system={}, count={}, output_dir={}",
+            system,
+            len(missing_counts),
+            output_dir,
         )
 
-    logger.info("Rep audio files prepared: {}", output_dir)
+        if system == "darwin":
+            _generate_with_macos_say(
+                counts=missing_counts,
+                output_dir=output_dir,
+                rate=rate,
+            )
+        else:
+            _generate_with_pyttsx3(
+                counts=missing_counts,
+                output_dir=output_dir,
+                rate=rate,
+            )
+
+        logger.info("Rep audio files prepared: {}", output_dir)
+
+    if trim_leading_silence and suffix == ".wav":
+        _trim_leading_silence_files(
+            counts=list(range(0, max_count + 1)),
+            output_dir=output_dir,
+            suffix=suffix,
+            threshold=trim_silence_threshold,
+            padding_ms=trim_silence_padding_ms,
+        )
 
 
 def _generate_with_macos_say(
@@ -127,3 +139,89 @@ def _generate_with_pyttsx3(
 
 def _audio_path(output_dir: Path, count: int, *, suffix: str) -> Path:
     return output_dir / f"{count}{suffix}"
+
+
+def _trim_leading_silence_files(
+    *,
+    counts: list[int],
+    output_dir: Path,
+    suffix: str,
+    threshold: int,
+    padding_ms: int,
+) -> None:
+    trimmed = 0
+    total_removed_ms = 0.0
+
+    for count in counts:
+        audio_file = _audio_path(output_dir, count, suffix=suffix)
+        if not audio_file.exists():
+            continue
+        removed_ms = _trim_leading_silence(audio_file, threshold=threshold, padding_ms=padding_ms)
+        if removed_ms > 0:
+            trimmed += 1
+            total_removed_ms += removed_ms
+
+    logger.info(
+        "Rep audio leading silence trim complete: files_trimmed={}, total_removed_ms={:.1f}, threshold={}, padding_ms={}",
+        trimmed,
+        total_removed_ms,
+        threshold,
+        padding_ms,
+    )
+
+
+def _trim_leading_silence(audio_file: Path, *, threshold: int, padding_ms: int) -> float:
+    with wave.open(str(audio_file), "rb") as reader:
+        params = reader.getparams()
+        frames = reader.readframes(params.nframes)
+
+    frame_size = params.sampwidth * params.nchannels
+    if params.nframes <= 0 or frame_size <= 0:
+        return 0.0
+
+    chunk_frames = max(1, params.framerate // 100)
+    leading_frames = 0
+    offset = 0
+    chunk_size = chunk_frames * frame_size
+
+    while offset < len(frames):
+        chunk = frames[offset : offset + chunk_size]
+        if _pcm_rms(chunk, params.sampwidth) > threshold:
+            break
+        chunk_frame_count = len(chunk) // frame_size
+        leading_frames += chunk_frame_count
+        offset += chunk_size
+
+    padding_frames = int(params.framerate * max(0, padding_ms) / 1000)
+    remove_frames = max(0, leading_frames - padding_frames)
+    if remove_frames <= 0:
+        return 0.0
+
+    start = min(len(frames), remove_frames * frame_size)
+    trimmed_frames = frames[start:]
+    if not trimmed_frames:
+        return 0.0
+
+    with wave.open(str(audio_file), "wb") as writer:
+        writer.setparams(params)
+        writer.writeframes(trimmed_frames)
+
+    return remove_frames / params.framerate * 1000
+
+
+def _pcm_rms(chunk: bytes, sample_width: int) -> float:
+    if not chunk:
+        return 0.0
+
+    if sample_width == 2:
+        sample_count = len(chunk) // 2
+        if sample_count == 0:
+            return 0.0
+        total = 0
+        for i in range(0, sample_count * 2, 2):
+            sample = int.from_bytes(chunk[i : i + 2], "little", signed=True)
+            total += sample * sample
+        return (total / sample_count) ** 0.5
+
+    peak = max(abs(byte - 128) for byte in chunk)
+    return float(peak)
diff --git a/app/audio/rep_announcer.py b/app/audio/rep_announcer.py
index 41a68fa..4580cb7 100644
--- a/app/audio/rep_announcer.py
+++ b/app/audio/rep_announcer.py
@@ -5,6 +5,7 @@ import shutil
 import subprocess
 import sys
 import threading
+import time
 from pathlib import Path
 
 from loguru import logger
@@ -24,12 +25,14 @@ class RepAnnouncer:
         self.max_count = max_count
         self.audio_dir = Path(audio_dir)
 
-        self._queue: queue.Queue[int | None] = queue.Queue()
+        self._queue: queue.Queue[tuple[int, float] | None] = queue.Queue()
         self._thread: threading.Thread | None = None
         self._current_process: subprocess.Popen | None = None
         self._closed = False
+        self._play_lock = threading.Lock()
 
         self._platform = sys.platform
+        self._direct_playback = self._platform.startswith("win")
 
         if self.enabled:
             self._start()
@@ -41,8 +44,26 @@ class RepAnnouncer:
         if count <= 0 or count > self.max_count:
             return
 
+        requested_at = time.perf_counter()
+        if self._direct_playback:
+            audio_file = self._audio_path(count)
+            if not audio_file.exists():
+                logger.warning("Rep audio file missing: {}", audio_file)
+                return
+            try:
+                self._play(audio_file)
+                logger.info(
+                    "Rep audio submitted immediately: count={}, submit_ms={:.1f}",
+                    count,
+                    (time.perf_counter() - requested_at) * 1000,
+                )
+            except Exception as exc:
+                logger.warning("Failed to play rep count {}: {}", count, exc)
+            return
+
         self._clear_pending_counts()
-        self._queue.put(count)
+        self._queue.put((count, requested_at))
+        logger.info("Rep audio queued: count={}", count)
 
     def close(self) -> None:
         """停止播报线程并释放资源"""
@@ -62,17 +83,24 @@ class RepAnnouncer:
         """启动后台播报线程"""
         self.audio_dir.mkdir(parents=True, exist_ok=True)
 
+        if self._direct_playback:
+            import winsound
+
+            logger.info("Rep announcer initialized in direct Windows mode, audio_dir={}", self.audio_dir)
+            return
+
         self._thread = threading.Thread(target=self._run, name="RepAnnouncer", daemon=True)
         self._thread.start()
 
-        logger.info("Rep announcer initialized, audio_dir={}", self.audio_dir)
+        logger.info("Rep announcer initialized in queued mode, audio_dir={}", self.audio_dir)
 
     def _run(self) -> None:
         """后台线程：从队列取次数，播放对应音频文件"""
         while True:
-            count = self._queue.get()
-            if count is None:
+            item = self._queue.get()
+            if item is None:
                 return
+            count, requested_at = item
 
             audio_file = self._audio_path(count)
             if not audio_file.exists():
@@ -81,33 +109,42 @@ class RepAnnouncer:
 
             try:
                 self._play(audio_file)
+                logger.info(
+                    "Rep audio submitted from queue: count={}, queue_ms={:.1f}",
+                    count,
+                    (time.perf_counter() - requested_at) * 1000,
+                )
             except Exception as exc:
                 logger.warning("Failed to play rep count {}: {}", count, exc)
 
     def _play(self, audio_file: Path) -> None:
         """播放音频文件（平台自适应）"""
-        self._stop_current_playback()
+        with self._play_lock:
+            self._stop_current_playback()
 
-        if self._platform == "darwin":
-            self._current_process = subprocess.Popen(
-                ["afplay", str(audio_file)],
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-            )
-        elif self._platform.startswith("win"):
-            import winsound
+            if self._platform == "darwin":
+                self._current_process = subprocess.Popen(
+                    ["afplay", str(audio_file)],
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                )
+            elif self._platform.startswith("win"):
+                import winsound
 
-            winsound.PlaySound(str(audio_file), winsound.SND_FILENAME | winsound.SND_ASYNC)
-        else:
-            player = shutil.which("paplay") or shutil.which("aplay")
-            if player is None:
-                logger.warning("No audio player found")
-                return
-            self._current_process = subprocess.Popen(
-                [player, str(audio_file)],
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-            )
+                winsound.PlaySound(
+                    str(audio_file),
+                    winsound.SND_FILENAME | winsound.SND_ASYNC | winsound.SND_NODEFAULT,
+                )
+            else:
+                player = shutil.which("paplay") or shutil.which("aplay")
+                if player is None:
+                    logger.warning("No audio player found")
+                    return
+                self._current_process = subprocess.Popen(
+                    [player, str(audio_file)],
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                )
 
     def _stop_current_playback(self) -> None:
         """中断当前正在播放的声音"""
diff --git a/app/core/lifecycle.py b/app/core/lifecycle.py
index 4edfa3b..be2829f 100644
--- a/app/core/lifecycle.py
+++ b/app/core/lifecycle.py
@@ -16,4 +16,7 @@ def startup() -> None:
         rate=config.audio.rep_announcer_rate,
         output_dir=config.audio.resolved_audio_dir,
         overwrite=False,
+        trim_leading_silence=config.audio.trim_leading_silence,
+        trim_silence_threshold=config.audio.trim_silence_threshold,
+        trim_silence_padding_ms=config.audio.trim_silence_padding_ms,
     )
diff --git a/app/exercises/dead_bug/detector.py b/app/exercises/dead_bug/detector.py
index c51f8b1..f95fbdc 100644
--- a/app/exercises/dead_bug/detector.py
+++ b/app/exercises/dead_bug/detector.py
@@ -52,6 +52,7 @@ class DeadBugDetector:
         self._result_event = threading.Event()
         self._inflight = False
         self._inflight_started_at = 0.0
+        self.last_timing: dict[str, float | bool] = {}
 
         def on_result(pose_result, _image, _timestamp_ms):
             with self._result_lock:
@@ -79,7 +80,9 @@ class DeadBugDetector:
 
     def process_frame(self, bgr_frame: np.ndarray, timestamp_ms: int) -> tuple[np.ndarray, DeadBugResult]:
         """处理单帧：姿态检测、指标计算、状态机更新、可视化叠加"""
+        total_started = time.perf_counter()
         timestamp_ms = self._normalize_timestamp(timestamp_ms)
+        normalize_done = time.perf_counter()
 
         with self._result_lock:
             if self._inflight and time.monotonic() - self._inflight_started_at > 0.5:
@@ -90,9 +93,11 @@ class DeadBugDetector:
             if should_submit:
                 self._inflight = True
                 self._inflight_started_at = time.monotonic()
+        lock_done = time.perf_counter()
 
         if should_submit:
             rgba_frame = bgr_to_rgba(bgr_frame)
+            convert_done = time.perf_counter()
             mp_image = mp.Image(image_format=mp.ImageFormat.SRGBA, data=rgba_frame)
             self._result_event.clear()
             try:
@@ -102,12 +107,20 @@ class DeadBugDetector:
                     self._inflight = False
                     self._inflight_started_at = 0.0
                 raise
+            submit_done = time.perf_counter()
             self._result_event.wait(timeout=0.08)
+            wait_done = time.perf_counter()
+        else:
+            convert_done = lock_done
+            submit_done = lock_done
+            wait_done = lock_done
 
         with self._result_lock:
             pose_result = self._latest_result
+        result_read_done = time.perf_counter()
 
         annotated = bgr_frame.copy()
+        copy_done = time.perf_counter()
 
         if pose_result is None or not pose_result.pose_landmarks:
             self._state.mark_no_pose()
@@ -120,6 +133,18 @@ class DeadBugDetector:
                 metrics=None,
             )
             draw_status_overlay(annotated, result)
+            self._record_timing(
+                total_started,
+                normalize_done,
+                lock_done,
+                convert_done,
+                submit_done,
+                wait_done,
+                result_read_done,
+                copy_done,
+                time.perf_counter(),
+                should_submit,
+            )
             return annotated, result
 
         landmarks = [Point(lm.x, lm.y, lm.z, getattr(lm, "visibility", 1.0)) for lm in pose_result.pose_landmarks[0]]
@@ -136,6 +161,18 @@ class DeadBugDetector:
                 metrics=None,
             )
             draw_status_overlay(annotated, result)
+            self._record_timing(
+                total_started,
+                normalize_done,
+                lock_done,
+                convert_done,
+                submit_done,
+                wait_done,
+                result_read_done,
+                copy_done,
+                time.perf_counter(),
+                should_submit,
+            )
             return annotated, result
 
         raw = calculate_metrics(
@@ -169,8 +206,46 @@ class DeadBugDetector:
 
         result = self._state.update(metrics)
         draw_status_overlay(annotated, result)
+        self._record_timing(
+            total_started,
+            normalize_done,
+            lock_done,
+            convert_done,
+            submit_done,
+            wait_done,
+            result_read_done,
+            copy_done,
+            time.perf_counter(),
+            should_submit,
+        )
         return annotated, result
 
+    def _record_timing(
+        self,
+        total_started: float,
+        normalize_done: float,
+        lock_done: float,
+        convert_done: float,
+        submit_done: float,
+        wait_done: float,
+        result_read_done: float,
+        copy_done: float,
+        finished: float,
+        submitted: bool,
+    ) -> None:
+        self.last_timing = {
+            "total_ms": (finished - total_started) * 1000,
+            "timestamp_ms": (normalize_done - total_started) * 1000,
+            "lock_ms": (lock_done - normalize_done) * 1000,
+            "convert_ms": (convert_done - lock_done) * 1000,
+            "submit_ms": (submit_done - convert_done) * 1000,
+            "wait_ms": (wait_done - submit_done) * 1000,
+            "result_read_ms": (result_read_done - wait_done) * 1000,
+            "copy_ms": (copy_done - result_read_done) * 1000,
+            "postprocess_draw_ms": (finished - copy_done) * 1000,
+            "submitted": submitted,
+        }
+
     def _normalize_timestamp(self, timestamp_ms: int) -> int:
         """确保时间戳严格递增（MediaPipe要求）"""
         if timestamp_ms <= self._last_timestamp_ms:
diff --git a/app/vision/pose_landmarker.py b/app/vision/pose_landmarker.py
index c831b69..998222d 100644
--- a/app/vision/pose_landmarker.py
+++ b/app/vision/pose_landmarker.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import platform
 from typing import Callable
 
 import mediapipe as mp
@@ -27,6 +28,11 @@ class PoseLandmarkerWrapper:
 
         if prefer_gpu:
             try:
+                if platform.system() == "Windows":
+                    logger.warning(
+                        "MediaPipe GPU delegate requested, but MediaPipe Tasks Python does not support GPU delegate on Windows; "
+                        "Intel Iris Xe cannot be used by this backend and CPU fallback is expected"
+                    )
                 self.delegate = BaseOptions.Delegate.GPU
                 self._landmarker = self._create(self.delegate, result_callback)
                 logger.info("MediaPipe PoseLandmarker initialized with GPU delegate")
diff --git a/app/webrtc/video_receiver.py b/app/webrtc/video_receiver.py
index ce32b10..53832b3 100644
--- a/app/webrtc/video_receiver.py
+++ b/app/webrtc/video_receiver.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import time
 
 import cv2
 from aiortc.mediastreams import MediaStreamError
@@ -25,6 +26,39 @@ def _format_pose_debug(pose_result) -> str:
         f"ll={metrics.left_leg_extended}, rl={metrics.right_leg_extended})"
     )
 
+
+def _new_perf_window() -> dict:
+    return {
+        "frames": 0,
+        "processed": 0,
+        "loop_ms": 0.0,
+        "to_ndarray_ms": 0.0,
+        "detect_ms": 0.0,
+        "show_ms": 0.0,
+        "max_loop_ms": 0.0,
+        "max_detect_ms": 0.0,
+        "detector": {},
+    }
+
+
+def _add_detector_timing(perf: dict, timing: dict[str, float | bool]) -> None:
+    detector = perf["detector"]
+    for key, value in timing.items():
+        if key == "submitted":
+            detector[key] = detector.get(key, 0) + (1 if value else 0)
+            continue
+        value = float(value)
+        detector[key] = detector.get(key, 0.0) + value
+        max_key = f"max_{key}"
+        detector[max_key] = max(detector.get(max_key, 0.0), value)
+
+
+def _avg(perf: dict, key: str, denominator: int) -> float:
+    if denominator <= 0:
+        return 0.0
+    return perf.get(key, 0.0) / denominator
+
+
 class VideoReceiver:
     """视频轨道接收与运动检测流水线"""
 
@@ -34,10 +68,19 @@ class VideoReceiver:
     async def run(self) -> None:
         """持续接收视频帧并进行姿态检测、渲染和语音播报"""
         log_every_n_frames = max(1, config.video.log_every_n_frames)
+        perf_log_every_n_frames = max(1, config.video.perf_log_every_n_frames)
+        slow_frame_ms = max(0.0, config.video.slow_frame_ms)
         logger.info(
-            "Start receiving video frames, process_every_n={}, log_every_n={}",
+            "Start receiving video frames, process_every_n={}, log_every_n={}, perf_log_every_n={}, slow_frame_ms={}",
             config.video.process_every_n_frames,
             log_every_n_frames,
+            perf_log_every_n_frames,
+            slow_frame_ms,
+        )
+        logger.info(
+            "OpenCV OpenCL status: have_opencl={}, use_opencl={}",
+            cv2.ocl.haveOpenCL(),
+            cv2.ocl.useOpenCL(),
         )
 
         frame_count = 0
@@ -57,23 +100,43 @@ class VideoReceiver:
         last_announced_rep = 0
         last_pose_result = None
         last_annotated = None
+        perf = _new_perf_window()
 
         try:
             while True:
+                loop_started = time.perf_counter()
                 frame = await self._track.recv()
                 frame_count += 1
+                recv_done = time.perf_counter()
                 raw_img = frame.to_ndarray(format="bgr24")
+                ndarray_done = time.perf_counter()
                 timestamp_ms = int(frame.time * 1000) if frame.time is not None else frame_count * 33
 
+                detect_ms = 0.0
                 if frame_count % config.video.process_every_n_frames == 0 or last_pose_result is None:
+                    detect_started = time.perf_counter()
                     processed_count += 1
                     last_annotated, last_pose_result = detector.process_frame(raw_img, timestamp_ms)
+                    detect_ms = (time.perf_counter() - detect_started) * 1000
+                    perf["processed"] += 1
+                    perf["detect_ms"] += detect_ms
+                    perf["max_detect_ms"] = max(perf["max_detect_ms"], detect_ms)
+                    _add_detector_timing(perf, detector.last_timing)
                     if last_pose_result.rep_count > last_announced_rep:
                         last_announced_rep = last_pose_result.rep_count
+                        announce_started = time.perf_counter()
                         announcer.announce_count(last_announced_rep)
+                        logger.info(
+                            "Rep completed and audio requested: count={}, frame={}, announce_call_ms={:.1f}",
+                            last_announced_rep,
+                            frame_count,
+                            (time.perf_counter() - announce_started) * 1000,
+                        )
 
                 display_img = last_annotated if last_annotated is not None else raw_img
+                show_started = time.perf_counter()
                 show_frame(display_img)
+                show_done = time.perf_counter()
 
                 if frame_count % log_every_n_frames == 0:
                     logger.info(
@@ -87,6 +150,53 @@ class VideoReceiver:
                         _format_pose_debug(last_pose_result) if last_pose_result is not None else "metrics=None",
                     )
 
+                loop_ms = (show_done - loop_started) * 1000
+                to_ndarray_ms = (ndarray_done - recv_done) * 1000
+                show_ms = (show_done - show_started) * 1000
+                perf["frames"] += 1
+                perf["loop_ms"] += loop_ms
+                perf["to_ndarray_ms"] += to_ndarray_ms
+                perf["show_ms"] += show_ms
+                perf["max_loop_ms"] = max(perf["max_loop_ms"], loop_ms)
+
+                if slow_frame_ms and loop_ms >= slow_frame_ms:
+                    logger.warning(
+                        "Slow video frame: frame={}, loop_ms={:.1f}, detect_ms={:.1f}, to_ndarray_ms={:.1f}, show_ms={:.1f}, shape={}",
+                        frame_count,
+                        loop_ms,
+                        detect_ms,
+                        to_ndarray_ms,
+                        show_ms,
+                        raw_img.shape,
+                    )
+
+                if frame_count % perf_log_every_n_frames == 0:
+                    frames = perf["frames"]
+                    processed = perf["processed"]
+                    detector_perf = perf["detector"]
+                    logger.info(
+                        "Perf window: frames={}, processed={}, avg_loop_ms={:.1f}, max_loop_ms={:.1f}, avg_to_ndarray_ms={:.1f}, "
+                        "avg_detect_ms={:.1f}, max_detect_ms={:.1f}, avg_show_ms={:.1f}, detector_avg_total_ms={:.1f}, "
+                        "detector_max_total_ms={:.1f}, detector_avg_wait_ms={:.1f}, detector_max_wait_ms={:.1f}, "
+                        "detector_avg_convert_ms={:.1f}, detector_avg_postprocess_draw_ms={:.1f}, detector_submitted={}",
+                        frames,
+                        processed,
+                        _avg(perf, "loop_ms", frames),
+                        perf["max_loop_ms"],
+                        _avg(perf, "to_ndarray_ms", frames),
+                        _avg(perf, "detect_ms", processed),
+                        perf["max_detect_ms"],
+                        _avg(perf, "show_ms", frames),
+                        _avg(detector_perf, "total_ms", processed),
+                        detector_perf.get("max_total_ms", 0.0),
+                        _avg(detector_perf, "wait_ms", processed),
+                        detector_perf.get("max_wait_ms", 0.0),
+                        _avg(detector_perf, "convert_ms", processed),
+                        _avg(detector_perf, "postprocess_draw_ms", processed),
+                        detector_perf.get("submitted", 0),
+                    )
+                    perf = _new_perf_window()
+
                 if is_esc_pressed():
                     logger.info("ESC pressed, closing display")
                     break
diff --git a/config.yaml b/config.yaml
index 3732501..20e4f6d 100644
--- a/config.yaml
+++ b/config.yaml
@@ -8,7 +8,9 @@ server:
 
 video:
   process_every_n_frames: 2
-  log_every_n_frames: 2
+  log_every_n_frames: 30
+  perf_log_every_n_frames: 30
+  slow_frame_ms: 100
 
 model:
   path: "./pose_models/pose_landmarker_full.task"
@@ -25,6 +27,9 @@ audio:
   rep_announcer_volume: 1.0
   rep_max_count: 200              # 预生成语音文件的最大次数
   rep_audio_dir: ""               # 空 = 自动使用 app/audio/reps
+  trim_leading_silence: true
+  trim_silence_threshold: 500
+  trim_silence_padding_ms: 20
 
 logging:
   dir: logs
diff --git a/configs/models.py b/configs/models.py
index a460d63..2038878 100644
--- a/configs/models.py
+++ b/configs/models.py
@@ -16,7 +16,9 @@ class ServerConfig:
 class VideoConfig:
     """视频帧处理配置"""
     process_every_n_frames: int = 2
-    log_every_n_frames: int = 2
+    log_every_n_frames: int = 30
+    perf_log_every_n_frames: int = 30
+    slow_frame_ms: float = 100.0
 
 
 @dataclass
@@ -49,6 +51,9 @@ class AudioConfig:
     rep_announcer_volume: float = 1.0
     rep_max_count: int = 200
     rep_audio_dir: str = ""
+    trim_leading_silence: bool = True
+    trim_silence_threshold: int = 500
+    trim_silence_padding_ms: int = 20
 
     @property
     def resolved_audio_dir(self) -> Path: