perf(video): 优化视频处理性能监控和音频播放

- 添加视频处理性能计时和统计功能
- 实现帧处理时间监控和慢帧警告
- 添加音频文件静音修剪功能
- 优化Windows平台音频播放实现
- 调整默认日志输出频率减少冗余信息
- 修复MediaPipe GPU委托在Windows上的兼容性问题
This commit is contained in:
2026-06-15 23:13:36 +08:00
parent 6dee2a2ff3
commit 08b6543b79
8 changed files with 387 additions and 48 deletions
+118 -20
View File
@@ -4,6 +4,7 @@ from __future__ import annotations
import platform
import shutil
import subprocess
import wave
from pathlib import Path
from loguru import logger
@@ -15,6 +16,9 @@ def generate_rep_audio_files(
rate: int,
output_dir: Path,
overwrite: bool = False,
trim_leading_silence: bool = True,
trim_silence_threshold: int = 500,
trim_silence_padding_ms: int = 20,
) -> None:
"""
确保 0~max_count 的运动次数语音 wav 文件存在。
@@ -40,29 +44,37 @@ def generate_rep_audio_files(
if not missing_counts:
logger.info("Rep audio files already prepared: {}", output_dir)
return
logger.info(
"Preparing rep audio files, system={}, count={}, output_dir={}",
system,
len(missing_counts),
output_dir,
)
if system == "darwin":
_generate_with_macos_say(
counts=missing_counts,
output_dir=output_dir,
rate=rate,
)
else:
_generate_with_pyttsx3(
counts=missing_counts,
output_dir=output_dir,
rate=rate,
logger.info(
"Preparing rep audio files, system={}, count={}, output_dir={}",
system,
len(missing_counts),
output_dir,
)
logger.info("Rep audio files prepared: {}", output_dir)
if system == "darwin":
_generate_with_macos_say(
counts=missing_counts,
output_dir=output_dir,
rate=rate,
)
else:
_generate_with_pyttsx3(
counts=missing_counts,
output_dir=output_dir,
rate=rate,
)
logger.info("Rep audio files prepared: {}", output_dir)
if trim_leading_silence and suffix == ".wav":
_trim_leading_silence_files(
counts=list(range(0, max_count + 1)),
output_dir=output_dir,
suffix=suffix,
threshold=trim_silence_threshold,
padding_ms=trim_silence_padding_ms,
)
def _generate_with_macos_say(
@@ -127,3 +139,89 @@ def _generate_with_pyttsx3(
def _audio_path(output_dir: Path, count: int, *, suffix: str) -> Path:
return output_dir / f"{count}{suffix}"
def _trim_leading_silence_files(
*,
counts: list[int],
output_dir: Path,
suffix: str,
threshold: int,
padding_ms: int,
) -> None:
trimmed = 0
total_removed_ms = 0.0
for count in counts:
audio_file = _audio_path(output_dir, count, suffix=suffix)
if not audio_file.exists():
continue
removed_ms = _trim_leading_silence(audio_file, threshold=threshold, padding_ms=padding_ms)
if removed_ms > 0:
trimmed += 1
total_removed_ms += removed_ms
logger.info(
"Rep audio leading silence trim complete: files_trimmed={}, total_removed_ms={:.1f}, threshold={}, padding_ms={}",
trimmed,
total_removed_ms,
threshold,
padding_ms,
)
def _trim_leading_silence(audio_file: Path, *, threshold: int, padding_ms: int) -> float:
with wave.open(str(audio_file), "rb") as reader:
params = reader.getparams()
frames = reader.readframes(params.nframes)
frame_size = params.sampwidth * params.nchannels
if params.nframes <= 0 or frame_size <= 0:
return 0.0
chunk_frames = max(1, params.framerate // 100)
leading_frames = 0
offset = 0
chunk_size = chunk_frames * frame_size
while offset < len(frames):
chunk = frames[offset : offset + chunk_size]
if _pcm_rms(chunk, params.sampwidth) > threshold:
break
chunk_frame_count = len(chunk) // frame_size
leading_frames += chunk_frame_count
offset += chunk_size
padding_frames = int(params.framerate * max(0, padding_ms) / 1000)
remove_frames = max(0, leading_frames - padding_frames)
if remove_frames <= 0:
return 0.0
start = min(len(frames), remove_frames * frame_size)
trimmed_frames = frames[start:]
if not trimmed_frames:
return 0.0
with wave.open(str(audio_file), "wb") as writer:
writer.setparams(params)
writer.writeframes(trimmed_frames)
return remove_frames / params.framerate * 1000
def _pcm_rms(chunk: bytes, sample_width: int) -> float:
if not chunk:
return 0.0
if sample_width == 2:
sample_count = len(chunk) // 2
if sample_count == 0:
return 0.0
total = 0
for i in range(0, sample_count * 2, 2):
sample = int.from_bytes(chunk[i : i + 2], "little", signed=True)
total += sample * sample
return (total / sample_count) ** 0.5
peak = max(abs(byte - 128) for byte in chunk)
return float(peak)
+62 -25
View File
@@ -5,6 +5,7 @@ import shutil
import subprocess
import sys
import threading
import time
from pathlib import Path
from loguru import logger
@@ -24,12 +25,14 @@ class RepAnnouncer:
self.max_count = max_count
self.audio_dir = Path(audio_dir)
self._queue: queue.Queue[int | None] = queue.Queue()
self._queue: queue.Queue[tuple[int, float] | None] = queue.Queue()
self._thread: threading.Thread | None = None
self._current_process: subprocess.Popen | None = None
self._closed = False
self._play_lock = threading.Lock()
self._platform = sys.platform
self._direct_playback = self._platform.startswith("win")
if self.enabled:
self._start()
@@ -41,8 +44,26 @@ class RepAnnouncer:
if count <= 0 or count > self.max_count:
return
requested_at = time.perf_counter()
if self._direct_playback:
audio_file = self._audio_path(count)
if not audio_file.exists():
logger.warning("Rep audio file missing: {}", audio_file)
return
try:
self._play(audio_file)
logger.info(
"Rep audio submitted immediately: count={}, submit_ms={:.1f}",
count,
(time.perf_counter() - requested_at) * 1000,
)
except Exception as exc:
logger.warning("Failed to play rep count {}: {}", count, exc)
return
self._clear_pending_counts()
self._queue.put(count)
self._queue.put((count, requested_at))
logger.info("Rep audio queued: count={}", count)
def close(self) -> None:
"""停止播报线程并释放资源"""
@@ -62,17 +83,24 @@ class RepAnnouncer:
"""启动后台播报线程"""
self.audio_dir.mkdir(parents=True, exist_ok=True)
if self._direct_playback:
import winsound
logger.info("Rep announcer initialized in direct Windows mode, audio_dir={}", self.audio_dir)
return
self._thread = threading.Thread(target=self._run, name="RepAnnouncer", daemon=True)
self._thread.start()
logger.info("Rep announcer initialized, audio_dir={}", self.audio_dir)
logger.info("Rep announcer initialized in queued mode, audio_dir={}", self.audio_dir)
def _run(self) -> None:
"""后台线程:从队列取次数,播放对应音频文件"""
while True:
count = self._queue.get()
if count is None:
item = self._queue.get()
if item is None:
return
count, requested_at = item
audio_file = self._audio_path(count)
if not audio_file.exists():
@@ -81,33 +109,42 @@ class RepAnnouncer:
try:
self._play(audio_file)
logger.info(
"Rep audio submitted from queue: count={}, queue_ms={:.1f}",
count,
(time.perf_counter() - requested_at) * 1000,
)
except Exception as exc:
logger.warning("Failed to play rep count {}: {}", count, exc)
def _play(self, audio_file: Path) -> None:
"""播放音频文件(平台自适应)"""
self._stop_current_playback()
with self._play_lock:
self._stop_current_playback()
if self._platform == "darwin":
self._current_process = subprocess.Popen(
["afplay", str(audio_file)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
elif self._platform.startswith("win"):
import winsound
if self._platform == "darwin":
self._current_process = subprocess.Popen(
["afplay", str(audio_file)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
elif self._platform.startswith("win"):
import winsound
winsound.PlaySound(str(audio_file), winsound.SND_FILENAME | winsound.SND_ASYNC)
else:
player = shutil.which("paplay") or shutil.which("aplay")
if player is None:
logger.warning("No audio player found")
return
self._current_process = subprocess.Popen(
[player, str(audio_file)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
winsound.PlaySound(
str(audio_file),
winsound.SND_FILENAME | winsound.SND_ASYNC | winsound.SND_NODEFAULT,
)
else:
player = shutil.which("paplay") or shutil.which("aplay")
if player is None:
logger.warning("No audio player found")
return
self._current_process = subprocess.Popen(
[player, str(audio_file)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
def _stop_current_playback(self) -> None:
"""中断当前正在播放的声音"""
+3
View File
@@ -16,4 +16,7 @@ def startup() -> None:
rate=config.audio.rep_announcer_rate,
output_dir=config.audio.resolved_audio_dir,
overwrite=False,
trim_leading_silence=config.audio.trim_leading_silence,
trim_silence_threshold=config.audio.trim_silence_threshold,
trim_silence_padding_ms=config.audio.trim_silence_padding_ms,
)
+75
View File
@@ -52,6 +52,7 @@ class DeadBugDetector:
self._result_event = threading.Event()
self._inflight = False
self._inflight_started_at = 0.0
self.last_timing: dict[str, float | bool] = {}
def on_result(pose_result, _image, _timestamp_ms):
with self._result_lock:
@@ -79,7 +80,9 @@ class DeadBugDetector:
def process_frame(self, bgr_frame: np.ndarray, timestamp_ms: int) -> tuple[np.ndarray, DeadBugResult]:
"""处理单帧:姿态检测、指标计算、状态机更新、可视化叠加"""
total_started = time.perf_counter()
timestamp_ms = self._normalize_timestamp(timestamp_ms)
normalize_done = time.perf_counter()
with self._result_lock:
if self._inflight and time.monotonic() - self._inflight_started_at > 0.5:
@@ -90,9 +93,11 @@ class DeadBugDetector:
if should_submit:
self._inflight = True
self._inflight_started_at = time.monotonic()
lock_done = time.perf_counter()
if should_submit:
rgba_frame = bgr_to_rgba(bgr_frame)
convert_done = time.perf_counter()
mp_image = mp.Image(image_format=mp.ImageFormat.SRGBA, data=rgba_frame)
self._result_event.clear()
try:
@@ -102,12 +107,20 @@ class DeadBugDetector:
self._inflight = False
self._inflight_started_at = 0.0
raise
submit_done = time.perf_counter()
self._result_event.wait(timeout=0.08)
wait_done = time.perf_counter()
else:
convert_done = lock_done
submit_done = lock_done
wait_done = lock_done
with self._result_lock:
pose_result = self._latest_result
result_read_done = time.perf_counter()
annotated = bgr_frame.copy()
copy_done = time.perf_counter()
if pose_result is None or not pose_result.pose_landmarks:
self._state.mark_no_pose()
@@ -120,6 +133,18 @@ class DeadBugDetector:
metrics=None,
)
draw_status_overlay(annotated, result)
self._record_timing(
total_started,
normalize_done,
lock_done,
convert_done,
submit_done,
wait_done,
result_read_done,
copy_done,
time.perf_counter(),
should_submit,
)
return annotated, result
landmarks = [Point(lm.x, lm.y, lm.z, getattr(lm, "visibility", 1.0)) for lm in pose_result.pose_landmarks[0]]
@@ -136,6 +161,18 @@ class DeadBugDetector:
metrics=None,
)
draw_status_overlay(annotated, result)
self._record_timing(
total_started,
normalize_done,
lock_done,
convert_done,
submit_done,
wait_done,
result_read_done,
copy_done,
time.perf_counter(),
should_submit,
)
return annotated, result
raw = calculate_metrics(
@@ -169,8 +206,46 @@ class DeadBugDetector:
result = self._state.update(metrics)
draw_status_overlay(annotated, result)
self._record_timing(
total_started,
normalize_done,
lock_done,
convert_done,
submit_done,
wait_done,
result_read_done,
copy_done,
time.perf_counter(),
should_submit,
)
return annotated, result
def _record_timing(
self,
total_started: float,
normalize_done: float,
lock_done: float,
convert_done: float,
submit_done: float,
wait_done: float,
result_read_done: float,
copy_done: float,
finished: float,
submitted: bool,
) -> None:
self.last_timing = {
"total_ms": (finished - total_started) * 1000,
"timestamp_ms": (normalize_done - total_started) * 1000,
"lock_ms": (lock_done - normalize_done) * 1000,
"convert_ms": (convert_done - lock_done) * 1000,
"submit_ms": (submit_done - convert_done) * 1000,
"wait_ms": (wait_done - submit_done) * 1000,
"result_read_ms": (result_read_done - wait_done) * 1000,
"copy_ms": (copy_done - result_read_done) * 1000,
"postprocess_draw_ms": (finished - copy_done) * 1000,
"submitted": submitted,
}
def _normalize_timestamp(self, timestamp_ms: int) -> int:
"""确保时间戳严格递增(MediaPipe要求)"""
if timestamp_ms <= self._last_timestamp_ms:
+6
View File
@@ -1,5 +1,6 @@
from __future__ import annotations
import platform
from typing import Callable
import mediapipe as mp
@@ -27,6 +28,11 @@ class PoseLandmarkerWrapper:
if prefer_gpu:
try:
if platform.system() == "Windows":
logger.warning(
"MediaPipe GPU delegate requested, but MediaPipe Tasks Python does not support GPU delegate on Windows; "
"Intel Iris Xe cannot be used by this backend and CPU fallback is expected"
)
self.delegate = BaseOptions.Delegate.GPU
self._landmarker = self._create(self.delegate, result_callback)
logger.info("MediaPipe PoseLandmarker initialized with GPU delegate")
+111 -1
View File
@@ -1,6 +1,7 @@
from __future__ import annotations
import asyncio
import time
import cv2
from aiortc.mediastreams import MediaStreamError
@@ -25,6 +26,39 @@ def _format_pose_debug(pose_result) -> str:
f"ll={metrics.left_leg_extended}, rl={metrics.right_leg_extended})"
)
def _new_perf_window() -> dict:
return {
"frames": 0,
"processed": 0,
"loop_ms": 0.0,
"to_ndarray_ms": 0.0,
"detect_ms": 0.0,
"show_ms": 0.0,
"max_loop_ms": 0.0,
"max_detect_ms": 0.0,
"detector": {},
}
def _add_detector_timing(perf: dict, timing: dict[str, float | bool]) -> None:
detector = perf["detector"]
for key, value in timing.items():
if key == "submitted":
detector[key] = detector.get(key, 0) + (1 if value else 0)
continue
value = float(value)
detector[key] = detector.get(key, 0.0) + value
max_key = f"max_{key}"
detector[max_key] = max(detector.get(max_key, 0.0), value)
def _avg(perf: dict, key: str, denominator: int) -> float:
if denominator <= 0:
return 0.0
return perf.get(key, 0.0) / denominator
class VideoReceiver:
"""视频轨道接收与运动检测流水线"""
@@ -34,10 +68,19 @@ class VideoReceiver:
async def run(self) -> None:
"""持续接收视频帧并进行姿态检测、渲染和语音播报"""
log_every_n_frames = max(1, config.video.log_every_n_frames)
perf_log_every_n_frames = max(1, config.video.perf_log_every_n_frames)
slow_frame_ms = max(0.0, config.video.slow_frame_ms)
logger.info(
"Start receiving video frames, process_every_n={}, log_every_n={}",
"Start receiving video frames, process_every_n={}, log_every_n={}, perf_log_every_n={}, slow_frame_ms={}",
config.video.process_every_n_frames,
log_every_n_frames,
perf_log_every_n_frames,
slow_frame_ms,
)
logger.info(
"OpenCV OpenCL status: have_opencl={}, use_opencl={}",
cv2.ocl.haveOpenCL(),
cv2.ocl.useOpenCL(),
)
frame_count = 0
@@ -57,23 +100,43 @@ class VideoReceiver:
last_announced_rep = 0
last_pose_result = None
last_annotated = None
perf = _new_perf_window()
try:
while True:
loop_started = time.perf_counter()
frame = await self._track.recv()
frame_count += 1
recv_done = time.perf_counter()
raw_img = frame.to_ndarray(format="bgr24")
ndarray_done = time.perf_counter()
timestamp_ms = int(frame.time * 1000) if frame.time is not None else frame_count * 33
detect_ms = 0.0
if frame_count % config.video.process_every_n_frames == 0 or last_pose_result is None:
detect_started = time.perf_counter()
processed_count += 1
last_annotated, last_pose_result = detector.process_frame(raw_img, timestamp_ms)
detect_ms = (time.perf_counter() - detect_started) * 1000
perf["processed"] += 1
perf["detect_ms"] += detect_ms
perf["max_detect_ms"] = max(perf["max_detect_ms"], detect_ms)
_add_detector_timing(perf, detector.last_timing)
if last_pose_result.rep_count > last_announced_rep:
last_announced_rep = last_pose_result.rep_count
announce_started = time.perf_counter()
announcer.announce_count(last_announced_rep)
logger.info(
"Rep completed and audio requested: count={}, frame={}, announce_call_ms={:.1f}",
last_announced_rep,
frame_count,
(time.perf_counter() - announce_started) * 1000,
)
display_img = last_annotated if last_annotated is not None else raw_img
show_started = time.perf_counter()
show_frame(display_img)
show_done = time.perf_counter()
if frame_count % log_every_n_frames == 0:
logger.info(
@@ -87,6 +150,53 @@ class VideoReceiver:
_format_pose_debug(last_pose_result) if last_pose_result is not None else "metrics=None",
)
loop_ms = (show_done - loop_started) * 1000
to_ndarray_ms = (ndarray_done - recv_done) * 1000
show_ms = (show_done - show_started) * 1000
perf["frames"] += 1
perf["loop_ms"] += loop_ms
perf["to_ndarray_ms"] += to_ndarray_ms
perf["show_ms"] += show_ms
perf["max_loop_ms"] = max(perf["max_loop_ms"], loop_ms)
if slow_frame_ms and loop_ms >= slow_frame_ms:
logger.warning(
"Slow video frame: frame={}, loop_ms={:.1f}, detect_ms={:.1f}, to_ndarray_ms={:.1f}, show_ms={:.1f}, shape={}",
frame_count,
loop_ms,
detect_ms,
to_ndarray_ms,
show_ms,
raw_img.shape,
)
if frame_count % perf_log_every_n_frames == 0:
frames = perf["frames"]
processed = perf["processed"]
detector_perf = perf["detector"]
logger.info(
"Perf window: frames={}, processed={}, avg_loop_ms={:.1f}, max_loop_ms={:.1f}, avg_to_ndarray_ms={:.1f}, "
"avg_detect_ms={:.1f}, max_detect_ms={:.1f}, avg_show_ms={:.1f}, detector_avg_total_ms={:.1f}, "
"detector_max_total_ms={:.1f}, detector_avg_wait_ms={:.1f}, detector_max_wait_ms={:.1f}, "
"detector_avg_convert_ms={:.1f}, detector_avg_postprocess_draw_ms={:.1f}, detector_submitted={}",
frames,
processed,
_avg(perf, "loop_ms", frames),
perf["max_loop_ms"],
_avg(perf, "to_ndarray_ms", frames),
_avg(perf, "detect_ms", processed),
perf["max_detect_ms"],
_avg(perf, "show_ms", frames),
_avg(detector_perf, "total_ms", processed),
detector_perf.get("max_total_ms", 0.0),
_avg(detector_perf, "wait_ms", processed),
detector_perf.get("max_wait_ms", 0.0),
_avg(detector_perf, "convert_ms", processed),
_avg(detector_perf, "postprocess_draw_ms", processed),
detector_perf.get("submitted", 0),
)
perf = _new_perf_window()
if is_esc_pressed():
logger.info("ESC pressed, closing display")
break
+6 -1
View File
@@ -8,7 +8,9 @@ server:
video:
process_every_n_frames: 2
log_every_n_frames: 2
log_every_n_frames: 30
perf_log_every_n_frames: 30
slow_frame_ms: 100
model:
path: "./pose_models/pose_landmarker_full.task"
@@ -25,6 +27,9 @@ audio:
rep_announcer_volume: 1.0
rep_max_count: 200 # 预生成语音文件的最大次数
rep_audio_dir: "" # 空 = 自动使用 app/audio/reps
trim_leading_silence: true
trim_silence_threshold: 500
trim_silence_padding_ms: 20
logging:
dir: logs
+6 -1
View File
@@ -16,7 +16,9 @@ class ServerConfig:
class VideoConfig:
"""视频帧处理配置"""
process_every_n_frames: int = 2
log_every_n_frames: int = 2
log_every_n_frames: int = 30
perf_log_every_n_frames: int = 30
slow_frame_ms: float = 100.0
@dataclass
@@ -49,6 +51,9 @@ class AudioConfig:
rep_announcer_volume: float = 1.0
rep_max_count: int = 200
rep_audio_dir: str = ""
trim_leading_silence: bool = True
trim_silence_threshold: int = 500
trim_silence_padding_ms: int = 20
@property
def resolved_audio_dir(self) -> Path: