# backend/session_manager.py
import json
import random
import time
import uuid
from typing import Any, Callable, Dict, List, Optional, Tuple

from .models import Clip, get_display_model_name


class SessionManager:
    """Manages evaluation sessions, responses, and export logic.

    Sessions and responses are kept in memory only:

    * ``self.sessions`` maps a session id to the dict built by
      :meth:`create_session`.
    * ``self.responses`` maps a response type (``"mos"``, ``"ab"``,
      ``"feedback"``) to a flat list of response dicts, each carrying the
      ``session_id`` it belongs to.
    """

    # Speaker labels the A/B pairing logic knows how to match up.
    _SPEAKERS = ("male", "female")

    # Rating dimensions stored for every MOS clip, in export order.
    _MOS_DIMENSIONS = (
        "clarity",
        "pronunciation",
        "prosody",
        "naturalness",
        "overall",
    )

    # Comparison type -> session key holding the pairs presented for it.
    _PAIR_KEYS = {
        "model_vs_model": "ab_model_pairs",
        "gender_vs_gender": "ab_gender_pairs",
    }

    def __init__(self, data_manager):
        """Store the data manager and start with empty in-memory state.

        Args:
            data_manager: Object exposing ``load_clips()``; it is called once
                per :meth:`create_session`.
        """
        self.data_manager = data_manager
        self.sessions: Dict[str, Dict[str, Any]] = {}
        self.responses: Dict[str, List[Dict[str, Any]]] = {
            "mos": [],
            "ab": [],
            "feedback": [],
        }

    # --------------------------
    # Session creation
    # --------------------------
    def create_session(
        self,
        *,
        mos_clips_per_model: int = 3,
        max_pairs: int = 6,
    ) -> Dict[str, Any]:
        """Create, register and return a new evaluation session.

        The session contains:

        * ``mos_clips``: up to ``mos_clips_per_model`` randomly chosen clips
          per model.
        * ``ab_model_pairs``: up to ``max_pairs`` pairs of clips with the
          same content and speaker label but different models.
        * ``ab_gender_pairs``: up to ``max_pairs`` ``(male, female)`` pairs
          of clips with the same content and model.

        At most one pair of each kind is drawn per exercise id.

        Args:
            mos_clips_per_model: Maximum MOS clips sampled for each model.
            max_pairs: Maximum number of pairs for each A/B comparison type.

        Returns:
            The session dict, which is also stored in ``self.sessions``.
        """
        session_id = str(uuid.uuid4())
        clips = self.data_manager.load_clips()

        # Seed from OS entropy rather than the wall clock, so two sessions
        # created within the same clock tick do not get identical selections.
        rng = random.Random()

        mos_clips = self._select_mos_clips(clips, rng, mos_clips_per_model)

        # Group by content (exercise + transcript) for comparisons
        content_groups: Dict[Any, List[Clip]] = {}
        for clip in clips:
            key = (clip.exercise, clip.exercise_id, clip.transcript)
            content_groups.setdefault(key, []).append(clip)

        # Index content keys by exercise id once instead of rescanning
        # every content group for every exercise.
        keys_by_exercise: Dict[Any, List[Any]] = {}
        for key in content_groups:
            keys_by_exercise.setdefault(key[1], []).append(key)

        all_exercises = list(keys_by_exercise)

        # --- Model vs Model (same gender, same exercise) ---
        rng.shuffle(all_exercises)
        ab_model_pairs = self._collect_pairs(
            all_exercises,
            keys_by_exercise,
            content_groups,
            rng,
            max_pairs,
            self._pick_model_pair,
        )

        # --- Gender vs Gender (same model, same exercise) ---
        rng.shuffle(all_exercises)
        ab_gender_pairs = self._collect_pairs(
            all_exercises,
            keys_by_exercise,
            content_groups,
            rng,
            max_pairs,
            self._pick_gender_pair,
        )

        session_data: Dict[str, Any] = {
            "session_id": session_id,
            "created_at": time.time(),
            "mos_clips": mos_clips,
            "ab_model_pairs": ab_model_pairs,
            "ab_gender_pairs": ab_gender_pairs,
            "completed": False,
        }
        self.sessions[session_id] = session_data
        return session_data

    @staticmethod
    def _select_mos_clips(clips, rng: random.Random, per_model: int) -> List[Clip]:
        """Sample up to ``per_model`` clips for every model present in ``clips``."""
        selected: List[Clip] = []
        if per_model <= 0:
            return selected

        # Every clip of a model is eligible regardless of its speaker label;
        # no gender pairing is required for MOS.
        clips_by_model: Dict[Any, List[Clip]] = {}
        for clip in clips:
            clips_by_model.setdefault(clip.model, []).append(clip)

        for model_clips in clips_by_model.values():
            count = min(per_model, len(model_clips))
            selected.extend(rng.sample(model_clips, count))
        return selected

    @staticmethod
    def _group_by_model_and_speaker(
        group: List[Clip],
    ) -> Dict[Any, Dict[Any, List[Clip]]]:
        """Index the clips of one content group as ``{model: {speaker: [clips]}}``."""
        by_model: Dict[Any, Dict[Any, List[Clip]]] = {}
        for clip in group:
            by_model.setdefault(clip.model, {}).setdefault(clip.speaker, []).append(clip)
        return by_model

    @classmethod
    def _collect_pairs(
        cls,
        exercises: List[Any],
        keys_by_exercise: Dict[Any, List[Any]],
        content_groups: Dict[Any, List[Clip]],
        rng: random.Random,
        max_pairs: int,
        pick: Callable[..., Optional[Tuple[Clip, Clip]]],
    ) -> List[Tuple[Clip, Clip]]:
        """Draw at most one pair per exercise, stopping at ``max_pairs`` pairs."""
        pairs: List[Tuple[Clip, Clip]] = []
        if max_pairs <= 0:
            return pairs

        for exercise_id in exercises:
            # Pick a random content group for this exercise
            key = rng.choice(keys_by_exercise[exercise_id])
            by_model = cls._group_by_model_and_speaker(content_groups[key])

            pair = pick(by_model, rng)
            if pair is None:
                continue

            pairs.append(pair)
            if len(pairs) >= max_pairs:
                break
        return pairs

    @classmethod
    def _pick_model_pair(
        cls,
        by_model: Dict[Any, Dict[Any, List[Clip]]],
        rng: random.Random,
    ) -> Optional[Tuple[Clip, Clip]]:
        """Pick two clips from different models sharing one speaker label.

        Returns ``None`` when no speaker label is covered by two models.
        """
        models_by_speaker = {
            speaker: [
                model
                for model, by_speaker in by_model.items()
                if by_speaker.get(speaker)
            ]
            for speaker in cls._SPEAKERS
        }
        valid_speakers = [
            speaker
            for speaker in cls._SPEAKERS
            if len(models_by_speaker[speaker]) >= 2
        ]
        if not valid_speakers:
            return None

        speaker = rng.choice(valid_speakers)
        model_a, model_b = rng.sample(models_by_speaker[speaker], 2)
        clip_a = rng.choice(by_model[model_a][speaker])
        clip_b = rng.choice(by_model[model_b][speaker])
        return clip_a, clip_b

    @staticmethod
    def _pick_gender_pair(
        by_model: Dict[Any, Dict[Any, List[Clip]]],
        rng: random.Random,
    ) -> Optional[Tuple[Clip, Clip]]:
        """Pick a ``(male, female)`` clip pair from one model.

        Returns ``None`` when no model has clips for both speaker labels.
        """
        valid_models = [
            model
            for model, by_speaker in by_model.items()
            if by_speaker.get("male") and by_speaker.get("female")
        ]
        if not valid_models:
            return None

        by_speaker = by_model[rng.choice(valid_models)]
        clip_male = rng.choice(by_speaker["male"])
        clip_female = rng.choice(by_speaker["female"])
        return clip_male, clip_female

    # --------------------------
    # Response storage helpers
    # --------------------------
    def save_response(self, response_type: str, response: Dict[str, Any]):
        """Generic low-level append with auto-timestamp.

        Args:
            response_type: Bucket in ``self.responses``; created on demand.
            response: Response dict. It is stored by reference, and a
                ``"timestamp"`` key is added in place when missing.
        """
        if "timestamp" not in response:
            response["timestamp"] = time.time()
        self.responses.setdefault(response_type, []).append(response)

    @staticmethod
    def _optional_int(value: Optional[int]) -> Optional[int]:
        """Return ``int(value)``, passing ``None`` through unchanged."""
        return int(value) if value is not None else None

    def save_mos_rating(
        self,
        session: Dict[str, Any],
        clip_id: str,
        model: str,
        clarity: Optional[int],
        pronunciation: Optional[int],
        prosody: Optional[int],
        naturalness: Optional[int],
        overall: Optional[int],
        comment: str,
        gender_mismatch: bool,
    ) -> None:
        """Optional helper for saving a single MOS rating.

        Does nothing when ``session`` is falsy. ``model`` is accepted for
        interface compatibility but is not stored.
        """
        if not session:
            return

        mos_response = {
            "session_id": session["session_id"],
            "clip_id": clip_id,
            "clarity": self._optional_int(clarity),
            "pronunciation": self._optional_int(pronunciation),
            "prosody": self._optional_int(prosody),
            "naturalness": self._optional_int(naturalness),
            "overall": self._optional_int(overall),
            "comment": comment or "",
            "gender_mismatch": bool(gender_mismatch),
            "timestamp": time.time(),
        }
        self.save_response("mos", mos_response)

    def save_ab_rating(
        self,
        session: Dict[str, Any],
        clip_a_id: str,
        clip_b_id: str,
        comparison_type: str,
        choice: str,
        comment: str,
        gender_mismatch_a: bool,
        gender_mismatch_b: bool,
    ) -> None:
        """Optional helper for saving a single A/B comparison.

        Does nothing when ``session`` is falsy.
        """
        if not session:
            return

        ab_response = {
            "session_id": session["session_id"],
            "clip_a_id": clip_a_id,
            "clip_b_id": clip_b_id,
            "comparison_type": comparison_type,
            "choice": choice,
            "comment": comment or "",
            "gender_mismatch_a": bool(gender_mismatch_a),
            "gender_mismatch_b": bool(gender_mismatch_b),
            "timestamp": time.time(),
        }
        self.save_response("ab", ab_response)

    # --------------------------
    # Bulk processing from JS JSON
    # --------------------------
    @staticmethod
    def _load_json_object(raw: Any, label: str) -> Optional[Dict[str, Any]]:
        """Parse ``raw`` as a JSON object; log and return ``None`` on failure."""
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
            print(f"[WARN] Failed to parse {label} data JSON: {e}")
            return None

        if not isinstance(parsed, dict):
            print(
                f"[WARN] Error processing {label} data: "
                f"expected a JSON object, got {type(parsed).__name__}"
            )
            return None
        return parsed

    @staticmethod
    def _parse_score(value: Any) -> Optional[int]:
        """Convert one submitted MOS score to ``int``.

        Falsy values (missing, ``None``, ``""``, ``0``) count as "not rated".
        A value that cannot be converted is logged and also treated as
        "not rated", so one bad field cannot abort the whole submission.
        """
        if not value:
            return None
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            print(f"[WARN] Ignoring non-numeric MOS score: {value!r}")
            return None

    def process_mos_data(
        self,
        session: Dict[str, Any],
        mos_data_json: str,
    ) -> None:
        """
        Take the JSON string from the hidden MOS textbox and turn it into
        individual MOS responses in self.responses["mos"].

        One response is stored for every clip in ``session["mos_clips"]``,
        whether or not the payload contains ratings for it; unrated
        dimensions are stored as ``None``. The payload is expected to be a
        JSON object keyed by clip id. Errors are logged, never raised.
        """
        print(f"[DEBUG] process_mos_data called with JSON: '{mos_data_json}'")
        print(f"[DEBUG] Session ID: {session.get('session_id') if session else 'None'}")

        if not session or not mos_data_json:
            print(
                f"[DEBUG] Skipping MOS processing - session: {session is not None}, "
                f"data length: {len(mos_data_json) if mos_data_json else 0}"
            )
            return

        ratings_data = self._load_json_object(mos_data_json, "MOS")
        if ratings_data is None:
            return

        try:
            # Get all clips that were presented to the user
            presented_clips = session.get("mos_clips", [])
            presented_clip_ids = {clip.id for clip in presented_clips}

            print(f"[DEBUG] Presented {len(presented_clip_ids)} MOS clips to user")
            print(f"[DEBUG] Received ratings for {len(ratings_data)} clips")

            # Process all presented clips, whether rated or not
            for clip in presented_clips:
                clip_id = clip.id
                ratings = ratings_data.get(clip_id, {})
                if not isinstance(ratings, dict):
                    # A malformed entry is treated like a missing one.
                    ratings = {}

                scores = {
                    dim: self._parse_score(ratings.get(dim))
                    for dim in self._MOS_DIMENSIONS
                }

                mos_response = {
                    "session_id": session["session_id"],
                    "clip_id": clip_id,
                    **scores,
                    "comment": ratings.get("comment", ""),
                    "gender_mismatch": ratings.get("gender_mismatch", False),
                    "timestamp": time.time(),
                }
                self.save_response("mos", mos_response)

                # Log whether this clip was rated or not
                has_ratings = any(score is not None for score in scores.values())
                status = "rated" if has_ratings else "not rated"
                print(f"[INFO] Processed MOS clip {clip_id} ({status})")

        except Exception as e:
            # Best-effort boundary for the UI callback: log instead of raising.
            print(f"[WARN] Error processing MOS data: {e}")

    def process_ab_data(
        self,
        session: Dict[str, Any],
        ab_data_json: str,
    ) -> None:
        """
        Take the JSON string from the hidden AB textbox and turn it into
        individual A/B responses in self.responses["ab"].

        Only comparison types that occur in the payload are processed. For
        each such type, one response is stored for every pair presented in
        the session (``choice`` is ``None`` when the pair was not rated).
        Pairs already stored for this session are skipped. Errors are
        logged, never raised.
        """
        print(f"[DEBUG] process_ab_data called with JSON: '{ab_data_json}'")
        print(f"[DEBUG] Session ID: {session.get('session_id') if session else 'None'}")

        if not session or not ab_data_json:
            print(
                f"[DEBUG] Skipping AB processing - session: {session is not None}, "
                f"data length: {len(ab_data_json) if ab_data_json else 0}"
            )
            return

        comparisons_data = self._load_json_object(ab_data_json, "A/B")
        if comparisons_data is None:
            return

        try:
            print(f"[DEBUG] Received ratings for {len(comparisons_data)} comparisons")

            # Ignore malformed (non-object) entries instead of aborting the batch.
            submitted = [
                entry for entry in comparisons_data.values() if isinstance(entry, dict)
            ]

            # Build a set of already saved comparison IDs to avoid duplicates
            session_id = session["session_id"]
            existing_pairs = {
                (r.get("clip_a_id"), r.get("clip_b_id"))
                for r in self.responses.get("ab", [])
                if r.get("session_id") == session_id
            }
            print(f"[DEBUG] Already have {len(existing_pairs)} AB comparisons saved for this session")

            # Collect ALL comparison types present in the data (first-seen order)
            seen_types: Dict[Any, None] = {}
            for entry in submitted:
                comparison_type = entry.get("comparison_type")
                if comparison_type:
                    seen_types.setdefault(comparison_type, None)
            comparison_types = set(seen_types)
            print(f"[DEBUG] Found comparison types in data: {comparison_types}")

            # Process each comparison type separately
            all_presented_pairs = []
            for comparison_type in seen_types:
                session_key = self._PAIR_KEYS.get(comparison_type)
                if session_key is None:
                    print(f"[WARN] Unknown comparison type: {comparison_type}")
                    continue

                pairs = session.get(session_key, [])
                label = comparison_type.replace("_", "-")
                print(f"[DEBUG] Processing {label} pairs: {len(pairs)} pairs presented")
                all_presented_pairs.extend(
                    (clip_a, clip_b, comparison_type) for clip_a, clip_b in pairs
                )

            # Process all presented pairs
            for clip_a, clip_b, comparison_type in all_presented_pairs:
                clip_a_id = clip_a.id
                clip_b_id = clip_b.id

                # Skip if we've already saved this pair
                if (clip_a_id, clip_b_id) in existing_pairs:
                    print(f"[DEBUG] Skipping duplicate comparison: {clip_a_id} vs {clip_b_id}")
                    continue

                # Find user's rating for this pair from the submitted data
                # JS sends numeric keys ("1", "2", etc.), so search by clip IDs
                comparison = next(
                    (
                        entry
                        for entry in submitted
                        if entry.get("clip_a_id") == clip_a_id
                        and entry.get("clip_b_id") == clip_b_id
                    ),
                    {},
                )

                ab_response = {
                    "session_id": session_id,
                    "clip_a_id": clip_a_id,
                    "clip_b_id": clip_b_id,
                    "comparison_type": comparison_type,
                    "choice": comparison.get("choice"),  # Can be None if not rated
                    "comment": comparison.get("comment", ""),
                    # Support both model_vs_model (gender_mismatch_a/b)
                    # and gender_vs_gender (gender_mismatch_male/female)
                    "gender_mismatch_a": comparison.get("gender_mismatch_a", False)
                    or comparison.get("gender_mismatch_male", False),
                    "gender_mismatch_b": comparison.get("gender_mismatch_b", False)
                    or comparison.get("gender_mismatch_female", False),
                    "timestamp": time.time(),
                }
                self.save_response("ab", ab_response)
                existing_pairs.add((clip_a_id, clip_b_id))  # Mark as saved

                status = "rated" if comparison.get("choice") else "not rated"
                print(f"[INFO] Processed A/B comparison {clip_a_id} vs {clip_b_id} ({status})")

        except Exception as e:
            # Best-effort boundary for the UI callback: log instead of raising.
            print(f"[WARN] Error processing A/B data: {e}")

    # --------------------------
    # Export
    # --------------------------
    @staticmethod
    def _clip_metadata(clip: Clip, prefix: str = "") -> Dict[str, Any]:
        """Return the exported clip fields (all but the id), keys prefixed."""
        return {
            f"{prefix}exercise": clip.exercise,
            f"{prefix}exercise_id": clip.exercise_id,
            f"{prefix}transcript": clip.transcript,
            f"{prefix}model": clip.model,  # Original model name
            f"{prefix}display_model": get_display_model_name(clip.model),  # Anonymized name
            f"{prefix}speaker": clip.speaker,
        }

    def export_session(self, session_id: str) -> Dict[str, Any]:
        """Build a fully annotated export dict for a given session.

        Stored responses are joined with the clip metadata of the session.
        Responses whose clip (or clip pair) is not part of the session are
        left out. An unknown ``session_id`` yields an export with empty
        rating lists.

        Returns:
            A dict with the keys ``session_metadata``, ``mos_ratings``,
            ``ab_comparisons`` and ``overall_feedback``.
        """
        session = self.sessions.get(session_id, {})

        # Create detailed MOS responses with full clip metadata
        detailed_mos_responses = []
        session_mos_clips = {clip.id: clip for clip in session.get("mos_clips", [])}

        for r in self.responses.get("mos", []):
            if r.get("session_id") != session_id:
                continue

            clip_id = r.get("clip_id")
            clip = session_mos_clips.get(clip_id)
            if clip is None:
                continue

            detailed_mos_responses.append(
                {
                    # Session metadata
                    "session_id": session_id,
                    "response_timestamp": r.get("timestamp", time.time()),
                    # Full clip metadata
                    "clip_id": clip_id,
                    **self._clip_metadata(clip),
                    # MOS ratings
                    "clarity": r.get("clarity"),
                    "pronunciation": r.get("pronunciation"),
                    "prosody": r.get("prosody"),
                    "naturalness": r.get("naturalness"),
                    "overall": r.get("overall"),
                    "comment": r.get("comment", ""),
                    # Quality control flags: True if user flagged wrong gender
                    "gender_mismatch": r.get("gender_mismatch", False),
                    # Response type
                    "evaluation_type": "mos_rating",
                }
            )

        # Look up presented pairs by (comparison type, clip A id, clip B id).
        # setdefault keeps the first occurrence, like a first-match scan.
        presented_pairs: Dict[Any, Tuple[Clip, Clip]] = {}
        for comparison_type, session_key in self._PAIR_KEYS.items():
            for pair_a, pair_b in session.get(session_key, []):
                presented_pairs.setdefault(
                    (comparison_type, pair_a.id, pair_b.id), (pair_a, pair_b)
                )

        # Create detailed A/B responses with full clip metadata
        detailed_ab_responses = []
        for r in self.responses.get("ab", []):
            if r.get("session_id") != session_id:
                continue

            comparison_type = r.get("comparison_type")
            pair = presented_pairs.get(
                (comparison_type, r.get("clip_a_id"), r.get("clip_b_id"))
            )
            if pair is None:
                continue
            clip_a, clip_b = pair

            detailed_ab_responses.append(
                {
                    # Session metadata
                    "session_id": session_id,
                    "response_timestamp": r.get("timestamp", time.time()),
                    # Comparison metadata
                    "comparison_type": comparison_type,
                    "choice": r.get("choice"),
                    "comment": r.get("comment", ""),
                    # Clip A metadata
                    "clip_a_id": clip_a.id,
                    **self._clip_metadata(clip_a, "clip_a_"),
                    # Clip B metadata
                    "clip_b_id": clip_b.id,
                    **self._clip_metadata(clip_b, "clip_b_"),
                    # Quality control flags: True if the clip has wrong gender
                    "gender_mismatch_a": r.get("gender_mismatch_a", False),
                    "gender_mismatch_b": r.get("gender_mismatch_b", False),
                    # Response type
                    "evaluation_type": "ab_comparison",
                }
            )

        return {
            "session_metadata": {
                "session_id": session_id,
                "created_at": session.get("created_at"),
                "completed": session.get("completed", False),
                "exported_at": time.time(),
                "total_mos_ratings": len(detailed_mos_responses),
                "total_ab_comparisons": len(detailed_ab_responses),
            },
            "mos_ratings": detailed_mos_responses,
            "ab_comparisons": detailed_ab_responses,
            "overall_feedback": [
                r
                for r in self.responses.get("feedback", [])
                if r.get("session_id") == session_id
            ],
        }