From 08b6543b792161e4606069726823a43dbce62234 Mon Sep 17 00:00:00 2001 From: wsy182 <2392948297@qq.com> Date: Mon, 15 Jun 2026 23:13:36 +0800 Subject: [PATCH] =?UTF-8?q?perf(video):=20=E4=BC=98=E5=8C=96=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E5=A4=84=E7=90=86=E6=80=A7=E8=83=BD=E7=9B=91=E6=8E=A7?= =?UTF-8?q?=E5=92=8C=E9=9F=B3=E9=A2=91=E6=92=AD=E6=94=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加视频处理性能计时和统计功能 - 实现帧处理时间监控和慢帧警告 - 添加音频文件静音修剪功能 - 优化Windows平台音频播放实现 - 调整默认日志输出频率减少冗余信息 - 修复MediaPipe GPU委托在Windows上的兼容性问题 --- app/audio/generate.py | 138 ++++++++++++++++++++++++----- app/audio/rep_announcer.py | 87 ++++++++++++------ app/core/lifecycle.py | 3 + app/exercises/dead_bug/detector.py | 75 ++++++++++++++++ app/vision/pose_landmarker.py | 6 ++ app/webrtc/video_receiver.py | 112 ++++++++++++++++++++++- config.yaml | 7 +- configs/models.py | 7 +- 8 files changed, 387 insertions(+), 48 deletions(-) diff --git a/app/audio/generate.py b/app/audio/generate.py index e13ed73..1b68e9f 100644 --- a/app/audio/generate.py +++ b/app/audio/generate.py @@ -4,6 +4,7 @@ from __future__ import annotations import platform import shutil import subprocess +import wave from pathlib import Path from loguru import logger @@ -15,6 +16,9 @@ def generate_rep_audio_files( rate: int, output_dir: Path, overwrite: bool = False, + trim_leading_silence: bool = True, + trim_silence_threshold: int = 500, + trim_silence_padding_ms: int = 20, ) -> None: """ 确保 0~max_count 的运动次数语音 wav 文件存在。 @@ -40,29 +44,37 @@ def generate_rep_audio_files( if not missing_counts: logger.info("Rep audio files already prepared: {}", output_dir) - return - - logger.info( - "Preparing rep audio files, system={}, count={}, output_dir={}", - system, - len(missing_counts), - output_dir, - ) - - if system == "darwin": - _generate_with_macos_say( - counts=missing_counts, - output_dir=output_dir, - rate=rate, - ) else: - _generate_with_pyttsx3( - counts=missing_counts, - output_dir=output_dir, - rate=rate, + logger.info( + "Preparing rep audio files, system={}, count={}, output_dir={}", + system, + len(missing_counts), + output_dir, ) - logger.info("Rep audio files prepared: {}", output_dir) + if system == "darwin": + _generate_with_macos_say( + counts=missing_counts, + output_dir=output_dir, + rate=rate, + ) + else: + _generate_with_pyttsx3( + counts=missing_counts, + output_dir=output_dir, + rate=rate, + ) + + logger.info("Rep audio files prepared: {}", output_dir) + + if trim_leading_silence and suffix == ".wav": + _trim_leading_silence_files( + counts=list(range(0, max_count + 1)), + output_dir=output_dir, + suffix=suffix, + threshold=trim_silence_threshold, + padding_ms=trim_silence_padding_ms, + ) def _generate_with_macos_say( @@ -127,3 +139,89 @@ def _generate_with_pyttsx3( def _audio_path(output_dir: Path, count: int, *, suffix: str) -> Path: return output_dir / f"{count}{suffix}" + + +def _trim_leading_silence_files( + *, + counts: list[int], + output_dir: Path, + suffix: str, + threshold: int, + padding_ms: int, +) -> None: + trimmed = 0 + total_removed_ms = 0.0 + + for count in counts: + audio_file = _audio_path(output_dir, count, suffix=suffix) + if not audio_file.exists(): + continue + removed_ms = _trim_leading_silence(audio_file, threshold=threshold, padding_ms=padding_ms) + if removed_ms > 0: + trimmed += 1 + total_removed_ms += removed_ms + + logger.info( + "Rep audio leading silence trim complete: files_trimmed={}, total_removed_ms={:.1f}, threshold={}, padding_ms={}", + trimmed, + total_removed_ms, + threshold, + padding_ms, + ) + + +def _trim_leading_silence(audio_file: Path, *, threshold: int, padding_ms: int) -> float: + with wave.open(str(audio_file), "rb") as reader: + params = reader.getparams() + frames = reader.readframes(params.nframes) + + frame_size = params.sampwidth * params.nchannels + if params.nframes <= 0 or frame_size <= 0: + return 0.0 + + chunk_frames = max(1, params.framerate // 100) + leading_frames = 0 + offset = 0 + chunk_size = chunk_frames * frame_size + + while offset < len(frames): + chunk = frames[offset : offset + chunk_size] + if _pcm_rms(chunk, params.sampwidth) > threshold: + break + chunk_frame_count = len(chunk) // frame_size + leading_frames += chunk_frame_count + offset += chunk_size + + padding_frames = int(params.framerate * max(0, padding_ms) / 1000) + remove_frames = max(0, leading_frames - padding_frames) + if remove_frames <= 0: + return 0.0 + + start = min(len(frames), remove_frames * frame_size) + trimmed_frames = frames[start:] + if not trimmed_frames: + return 0.0 + + with wave.open(str(audio_file), "wb") as writer: + writer.setparams(params) + writer.writeframes(trimmed_frames) + + return remove_frames / params.framerate * 1000 + + +def _pcm_rms(chunk: bytes, sample_width: int) -> float: + if not chunk: + return 0.0 + + if sample_width == 2: + sample_count = len(chunk) // 2 + if sample_count == 0: + return 0.0 + total = 0 + for i in range(0, sample_count * 2, 2): + sample = int.from_bytes(chunk[i : i + 2], "little", signed=True) + total += sample * sample + return (total / sample_count) ** 0.5 + + peak = max(abs(byte - 128) for byte in chunk) + return float(peak) diff --git a/app/audio/rep_announcer.py b/app/audio/rep_announcer.py index 41a68fa..4580cb7 100644 --- a/app/audio/rep_announcer.py +++ b/app/audio/rep_announcer.py @@ -5,6 +5,7 @@ import shutil import subprocess import sys import threading +import time from pathlib import Path from loguru import logger @@ -24,12 +25,14 @@ class RepAnnouncer: self.max_count = max_count self.audio_dir = Path(audio_dir) - self._queue: queue.Queue[int | None] = queue.Queue() + self._queue: queue.Queue[tuple[int, float] | None] = queue.Queue() self._thread: threading.Thread | None = None self._current_process: subprocess.Popen | None = None self._closed = False + self._play_lock = threading.Lock() self._platform = sys.platform + self._direct_playback = self._platform.startswith("win") if self.enabled: self._start() @@ -41,8 +44,26 @@ class RepAnnouncer: if count <= 0 or count > self.max_count: return + requested_at = time.perf_counter() + if self._direct_playback: + audio_file = self._audio_path(count) + if not audio_file.exists(): + logger.warning("Rep audio file missing: {}", audio_file) + return + try: + self._play(audio_file) + logger.info( + "Rep audio submitted immediately: count={}, submit_ms={:.1f}", + count, + (time.perf_counter() - requested_at) * 1000, + ) + except Exception as exc: + logger.warning("Failed to play rep count {}: {}", count, exc) + return + self._clear_pending_counts() - self._queue.put(count) + self._queue.put((count, requested_at)) + logger.info("Rep audio queued: count={}", count) def close(self) -> None: """停止播报线程并释放资源""" @@ -62,17 +83,24 @@ class RepAnnouncer: """启动后台播报线程""" self.audio_dir.mkdir(parents=True, exist_ok=True) + if self._direct_playback: + import winsound + + logger.info("Rep announcer initialized in direct Windows mode, audio_dir={}", self.audio_dir) + return + self._thread = threading.Thread(target=self._run, name="RepAnnouncer", daemon=True) self._thread.start() - logger.info("Rep announcer initialized, audio_dir={}", self.audio_dir) + logger.info("Rep announcer initialized in queued mode, audio_dir={}", self.audio_dir) def _run(self) -> None: """后台线程:从队列取次数,播放对应音频文件""" while True: - count = self._queue.get() - if count is None: + item = self._queue.get() + if item is None: return + count, requested_at = item audio_file = self._audio_path(count) if not audio_file.exists(): @@ -81,33 +109,42 @@ class RepAnnouncer: try: self._play(audio_file) + logger.info( + "Rep audio submitted from queue: count={}, queue_ms={:.1f}", + count, + (time.perf_counter() - requested_at) * 1000, + ) except Exception as exc: logger.warning("Failed to play rep count {}: {}", count, exc) def _play(self, audio_file: Path) -> None: """播放音频文件(平台自适应)""" - self._stop_current_playback() + with self._play_lock: + self._stop_current_playback() - if self._platform == "darwin": - self._current_process = subprocess.Popen( - ["afplay", str(audio_file)], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - elif self._platform.startswith("win"): - import winsound + if self._platform == "darwin": + self._current_process = subprocess.Popen( + ["afplay", str(audio_file)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + elif self._platform.startswith("win"): + import winsound - winsound.PlaySound(str(audio_file), winsound.SND_FILENAME | winsound.SND_ASYNC) - else: - player = shutil.which("paplay") or shutil.which("aplay") - if player is None: - logger.warning("No audio player found") - return - self._current_process = subprocess.Popen( - [player, str(audio_file)], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) + winsound.PlaySound( + str(audio_file), + winsound.SND_FILENAME | winsound.SND_ASYNC | winsound.SND_NODEFAULT, + ) + else: + player = shutil.which("paplay") or shutil.which("aplay") + if player is None: + logger.warning("No audio player found") + return + self._current_process = subprocess.Popen( + [player, str(audio_file)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) def _stop_current_playback(self) -> None: """中断当前正在播放的声音""" diff --git a/app/core/lifecycle.py b/app/core/lifecycle.py index 4edfa3b..be2829f 100644 --- a/app/core/lifecycle.py +++ b/app/core/lifecycle.py @@ -16,4 +16,7 @@ def startup() -> None: rate=config.audio.rep_announcer_rate, output_dir=config.audio.resolved_audio_dir, overwrite=False, + trim_leading_silence=config.audio.trim_leading_silence, + trim_silence_threshold=config.audio.trim_silence_threshold, + trim_silence_padding_ms=config.audio.trim_silence_padding_ms, ) diff --git a/app/exercises/dead_bug/detector.py b/app/exercises/dead_bug/detector.py index c51f8b1..f95fbdc 100644 --- a/app/exercises/dead_bug/detector.py +++ b/app/exercises/dead_bug/detector.py @@ -52,6 +52,7 @@ class DeadBugDetector: self._result_event = threading.Event() self._inflight = False self._inflight_started_at = 0.0 + self.last_timing: dict[str, float | bool] = {} def on_result(pose_result, _image, _timestamp_ms): with self._result_lock: @@ -79,7 +80,9 @@ class DeadBugDetector: def process_frame(self, bgr_frame: np.ndarray, timestamp_ms: int) -> tuple[np.ndarray, DeadBugResult]: """处理单帧:姿态检测、指标计算、状态机更新、可视化叠加""" + total_started = time.perf_counter() timestamp_ms = self._normalize_timestamp(timestamp_ms) + normalize_done = time.perf_counter() with self._result_lock: if self._inflight and time.monotonic() - self._inflight_started_at > 0.5: @@ -90,9 +93,11 @@ class DeadBugDetector: if should_submit: self._inflight = True self._inflight_started_at = time.monotonic() + lock_done = time.perf_counter() if should_submit: rgba_frame = bgr_to_rgba(bgr_frame) + convert_done = time.perf_counter() mp_image = mp.Image(image_format=mp.ImageFormat.SRGBA, data=rgba_frame) self._result_event.clear() try: @@ -102,12 +107,20 @@ class DeadBugDetector: self._inflight = False self._inflight_started_at = 0.0 raise + submit_done = time.perf_counter() self._result_event.wait(timeout=0.08) + wait_done = time.perf_counter() + else: + convert_done = lock_done + submit_done = lock_done + wait_done = lock_done with self._result_lock: pose_result = self._latest_result + result_read_done = time.perf_counter() annotated = bgr_frame.copy() + copy_done = time.perf_counter() if pose_result is None or not pose_result.pose_landmarks: self._state.mark_no_pose() @@ -120,6 +133,18 @@ class DeadBugDetector: metrics=None, ) draw_status_overlay(annotated, result) + self._record_timing( + total_started, + normalize_done, + lock_done, + convert_done, + submit_done, + wait_done, + result_read_done, + copy_done, + time.perf_counter(), + should_submit, + ) return annotated, result landmarks = [Point(lm.x, lm.y, lm.z, getattr(lm, "visibility", 1.0)) for lm in pose_result.pose_landmarks[0]] @@ -136,6 +161,18 @@ class DeadBugDetector: metrics=None, ) draw_status_overlay(annotated, result) + self._record_timing( + total_started, + normalize_done, + lock_done, + convert_done, + submit_done, + wait_done, + result_read_done, + copy_done, + time.perf_counter(), + should_submit, + ) return annotated, result raw = calculate_metrics( @@ -169,8 +206,46 @@ class DeadBugDetector: result = self._state.update(metrics) draw_status_overlay(annotated, result) + self._record_timing( + total_started, + normalize_done, + lock_done, + convert_done, + submit_done, + wait_done, + result_read_done, + copy_done, + time.perf_counter(), + should_submit, + ) return annotated, result + def _record_timing( + self, + total_started: float, + normalize_done: float, + lock_done: float, + convert_done: float, + submit_done: float, + wait_done: float, + result_read_done: float, + copy_done: float, + finished: float, + submitted: bool, + ) -> None: + self.last_timing = { + "total_ms": (finished - total_started) * 1000, + "timestamp_ms": (normalize_done - total_started) * 1000, + "lock_ms": (lock_done - normalize_done) * 1000, + "convert_ms": (convert_done - lock_done) * 1000, + "submit_ms": (submit_done - convert_done) * 1000, + "wait_ms": (wait_done - submit_done) * 1000, + "result_read_ms": (result_read_done - wait_done) * 1000, + "copy_ms": (copy_done - result_read_done) * 1000, + "postprocess_draw_ms": (finished - copy_done) * 1000, + "submitted": submitted, + } + def _normalize_timestamp(self, timestamp_ms: int) -> int: """确保时间戳严格递增(MediaPipe要求)""" if timestamp_ms <= self._last_timestamp_ms: diff --git a/app/vision/pose_landmarker.py b/app/vision/pose_landmarker.py index c831b69..998222d 100644 --- a/app/vision/pose_landmarker.py +++ b/app/vision/pose_landmarker.py @@ -1,5 +1,6 @@ from __future__ import annotations +import platform from typing import Callable import mediapipe as mp @@ -27,6 +28,11 @@ class PoseLandmarkerWrapper: if prefer_gpu: try: + if platform.system() == "Windows": + logger.warning( + "MediaPipe GPU delegate requested, but MediaPipe Tasks Python does not support GPU delegate on Windows; " + "Intel Iris Xe cannot be used by this backend and CPU fallback is expected" + ) self.delegate = BaseOptions.Delegate.GPU self._landmarker = self._create(self.delegate, result_callback) logger.info("MediaPipe PoseLandmarker initialized with GPU delegate") diff --git a/app/webrtc/video_receiver.py b/app/webrtc/video_receiver.py index ce32b10..53832b3 100644 --- a/app/webrtc/video_receiver.py +++ b/app/webrtc/video_receiver.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import time import cv2 from aiortc.mediastreams import MediaStreamError @@ -25,6 +26,39 @@ def _format_pose_debug(pose_result) -> str: f"ll={metrics.left_leg_extended}, rl={metrics.right_leg_extended})" ) + +def _new_perf_window() -> dict: + return { + "frames": 0, + "processed": 0, + "loop_ms": 0.0, + "to_ndarray_ms": 0.0, + "detect_ms": 0.0, + "show_ms": 0.0, + "max_loop_ms": 0.0, + "max_detect_ms": 0.0, + "detector": {}, + } + + +def _add_detector_timing(perf: dict, timing: dict[str, float | bool]) -> None: + detector = perf["detector"] + for key, value in timing.items(): + if key == "submitted": + detector[key] = detector.get(key, 0) + (1 if value else 0) + continue + value = float(value) + detector[key] = detector.get(key, 0.0) + value + max_key = f"max_{key}" + detector[max_key] = max(detector.get(max_key, 0.0), value) + + +def _avg(perf: dict, key: str, denominator: int) -> float: + if denominator <= 0: + return 0.0 + return perf.get(key, 0.0) / denominator + + class VideoReceiver: """视频轨道接收与运动检测流水线""" @@ -34,10 +68,19 @@ class VideoReceiver: async def run(self) -> None: """持续接收视频帧并进行姿态检测、渲染和语音播报""" log_every_n_frames = max(1, config.video.log_every_n_frames) + perf_log_every_n_frames = max(1, config.video.perf_log_every_n_frames) + slow_frame_ms = max(0.0, config.video.slow_frame_ms) logger.info( - "Start receiving video frames, process_every_n={}, log_every_n={}", + "Start receiving video frames, process_every_n={}, log_every_n={}, perf_log_every_n={}, slow_frame_ms={}", config.video.process_every_n_frames, log_every_n_frames, + perf_log_every_n_frames, + slow_frame_ms, + ) + logger.info( + "OpenCV OpenCL status: have_opencl={}, use_opencl={}", + cv2.ocl.haveOpenCL(), + cv2.ocl.useOpenCL(), ) frame_count = 0 @@ -57,23 +100,43 @@ class VideoReceiver: last_announced_rep = 0 last_pose_result = None last_annotated = None + perf = _new_perf_window() try: while True: + loop_started = time.perf_counter() frame = await self._track.recv() frame_count += 1 + recv_done = time.perf_counter() raw_img = frame.to_ndarray(format="bgr24") + ndarray_done = time.perf_counter() timestamp_ms = int(frame.time * 1000) if frame.time is not None else frame_count * 33 + detect_ms = 0.0 if frame_count % config.video.process_every_n_frames == 0 or last_pose_result is None: + detect_started = time.perf_counter() processed_count += 1 last_annotated, last_pose_result = detector.process_frame(raw_img, timestamp_ms) + detect_ms = (time.perf_counter() - detect_started) * 1000 + perf["processed"] += 1 + perf["detect_ms"] += detect_ms + perf["max_detect_ms"] = max(perf["max_detect_ms"], detect_ms) + _add_detector_timing(perf, detector.last_timing) if last_pose_result.rep_count > last_announced_rep: last_announced_rep = last_pose_result.rep_count + announce_started = time.perf_counter() announcer.announce_count(last_announced_rep) + logger.info( + "Rep completed and audio requested: count={}, frame={}, announce_call_ms={:.1f}", + last_announced_rep, + frame_count, + (time.perf_counter() - announce_started) * 1000, + ) display_img = last_annotated if last_annotated is not None else raw_img + show_started = time.perf_counter() show_frame(display_img) + show_done = time.perf_counter() if frame_count % log_every_n_frames == 0: logger.info( @@ -87,6 +150,53 @@ class VideoReceiver: _format_pose_debug(last_pose_result) if last_pose_result is not None else "metrics=None", ) + loop_ms = (show_done - loop_started) * 1000 + to_ndarray_ms = (ndarray_done - recv_done) * 1000 + show_ms = (show_done - show_started) * 1000 + perf["frames"] += 1 + perf["loop_ms"] += loop_ms + perf["to_ndarray_ms"] += to_ndarray_ms + perf["show_ms"] += show_ms + perf["max_loop_ms"] = max(perf["max_loop_ms"], loop_ms) + + if slow_frame_ms and loop_ms >= slow_frame_ms: + logger.warning( + "Slow video frame: frame={}, loop_ms={:.1f}, detect_ms={:.1f}, to_ndarray_ms={:.1f}, show_ms={:.1f}, shape={}", + frame_count, + loop_ms, + detect_ms, + to_ndarray_ms, + show_ms, + raw_img.shape, + ) + + if frame_count % perf_log_every_n_frames == 0: + frames = perf["frames"] + processed = perf["processed"] + detector_perf = perf["detector"] + logger.info( + "Perf window: frames={}, processed={}, avg_loop_ms={:.1f}, max_loop_ms={:.1f}, avg_to_ndarray_ms={:.1f}, " + "avg_detect_ms={:.1f}, max_detect_ms={:.1f}, avg_show_ms={:.1f}, detector_avg_total_ms={:.1f}, " + "detector_max_total_ms={:.1f}, detector_avg_wait_ms={:.1f}, detector_max_wait_ms={:.1f}, " + "detector_avg_convert_ms={:.1f}, detector_avg_postprocess_draw_ms={:.1f}, detector_submitted={}", + frames, + processed, + _avg(perf, "loop_ms", frames), + perf["max_loop_ms"], + _avg(perf, "to_ndarray_ms", frames), + _avg(perf, "detect_ms", processed), + perf["max_detect_ms"], + _avg(perf, "show_ms", frames), + _avg(detector_perf, "total_ms", processed), + detector_perf.get("max_total_ms", 0.0), + _avg(detector_perf, "wait_ms", processed), + detector_perf.get("max_wait_ms", 0.0), + _avg(detector_perf, "convert_ms", processed), + _avg(detector_perf, "postprocess_draw_ms", processed), + detector_perf.get("submitted", 0), + ) + perf = _new_perf_window() + if is_esc_pressed(): logger.info("ESC pressed, closing display") break diff --git a/config.yaml b/config.yaml index 3732501..20e4f6d 100644 --- a/config.yaml +++ b/config.yaml @@ -8,7 +8,9 @@ server: video: process_every_n_frames: 2 - log_every_n_frames: 2 + log_every_n_frames: 30 + perf_log_every_n_frames: 30 + slow_frame_ms: 100 model: path: "./pose_models/pose_landmarker_full.task" @@ -25,6 +27,9 @@ audio: rep_announcer_volume: 1.0 rep_max_count: 200 # 预生成语音文件的最大次数 rep_audio_dir: "" # 空 = 自动使用 app/audio/reps + trim_leading_silence: true + trim_silence_threshold: 500 + trim_silence_padding_ms: 20 logging: dir: logs diff --git a/configs/models.py b/configs/models.py index a460d63..2038878 100644 --- a/configs/models.py +++ b/configs/models.py @@ -16,7 +16,9 @@ class ServerConfig: class VideoConfig: """视频帧处理配置""" process_every_n_frames: int = 2 - log_every_n_frames: int = 2 + log_every_n_frames: int = 30 + perf_log_every_n_frames: int = 30 + slow_frame_ms: float = 100.0 @dataclass @@ -49,6 +51,9 @@ class AudioConfig: rep_announcer_volume: float = 1.0 rep_max_count: int = 200 rep_audio_dir: str = "" + trim_leading_silence: bool = True + trim_silence_threshold: int = 500 + trim_silence_padding_ms: int = 20 @property def resolved_audio_dir(self) -> Path: