How can I get results with timestamps?

#3
by CodyCoding - opened

# Decoding options asking the ASR pipeline for timestamped output.
generate_kwargs = dict(
    language="Japanese",
    no_repeat_ngram_size=0,
    repetition_penalty=1.0,
    return_timestamps=True,
)

I can't get timestamps from the pipeline — I only get the text.

我尝试根据Qwen/Claude的建议,使用whisper_timestamped库使得此模型能够输出时间轴。就像这样:
I tried to follow the suggestion from Qwen/Claude and used the whisper_timestamped library to make this model capable of outputting timestamps. Like this:

from whisper_timestamped import load_model, transcribe
# NOTE(review): `result` comes from a transcribe(...) call not shown in this
# excerpt — presumably result = transcribe(model, audio); confirm upstream.
# Print the results with segment-level timestamps.
for segment in result['segments']:
    print(f"[{segment['start']:.2f}s -> {segment['end']:.2f}s] {segment['text']}")
    # Word-level timestamps, if required. This loop must be nested inside the
    # segment loop; the original paste had it un-indented, which would only
    # iterate the words of the final segment.
    for word in segment['words']:
        print(f" {word['text']}: {word['start']:.2f}s -> {word['end']:.2f}s")

这么做确实可以让模型输出时间轴了。但是模型出现了大量的幻听现象,导致模型几乎不可用。并且我可以确信(经过测试),如果通过正常的方式加载模型,并使用不会出现这么严重的幻听。
Doing so does indeed make the model output timestamps. However, the model started experiencing a significant amount of hallucinations, making it almost unusable. And I can confirm (after testing) that if the model is loaded in the normal way, such severe hallucinations do not occur.
我还在尝试引入VAD,但还没有成功。
I'm still trying to introduce VAD, but I haven't succeeded yet.
我使用LLM进行翻译(Qwen2.5-Max),如有表达不当的地方请务必指出。
I used an LLM for translation (Qwen2.5-Max). If there are any inappropriate expressions, please be sure to point them out.

I'm very into LLMs, and here's how I dealt with this problem using an LLM (even the code below was provided by an LLM, LoL). It works perfectly for me — cheap and fast. Note that I use the tiny Whisper model to get the first-pass 'rough' timestamps.

def load_data(json_path, accurate_text_path):
    """Load the timestamp JSON and the accurate transcript text.

    Returns a ``(timing_data, accurate_text)`` tuple. On any failure
    (missing file or malformed JSON) an error message is printed and
    ``(None, None)`` is returned instead.
    """
    failure = (None, None)

    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            timing_data = json.load(handle)
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_path}")
        return failure
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON file format at {json_path}")
        return failure

    try:
        with open(accurate_text_path, 'r', encoding='utf-8') as handle:
            accurate_text = handle.read()
    except FileNotFoundError:
        print(f"Error: Accurate text file not found at {accurate_text_path}")
        return failure

    return timing_data, accurate_text

def format_timing_data_for_prompt(timing_data):
    """Render the segment list as an LLM-friendly plain-text summary.

    Each segment becomes one ``- Time: start - end, Text: "..."`` line.
    The final ``.strip()`` also removes the two leading spaces from the
    first line, so the first bullet is flush left while the rest are
    indented (kept for parity with the original output).
    """
    rendered = "".join(
        f"  - Time: {seg.get('start'):.3f} - {seg.get('end'):.3f}, "
        f"Text: \"{seg.get('text')}\"\n"
        for seg in timing_data.get("segments", [])
    )
    return rendered.strip()

def align_and_translate(formatted_timings, accurate_text):
    """
    Use the OpenAI API to align and translate the text.

    Parameters:
        formatted_timings: string produced by format_timing_data_for_prompt,
            giving rough-but-timestamped segments.
        accurate_text: the accurate (but untimed) transcript.

    Returns the parsed subtitle list (each item has 'start', 'end',
    'original_text', 'translated_text'), or None on any failure.
    """
    system_prompt = """
    You are a professional subtitle creation and translation expert. Your task is to create an accurate, timestamped English translation based on two Japanese transcripts.
    One is the "timestamped rough transcript," which provides accurate timing but may have incorrect text.
    The other is the "accurate transcript," which provides accurate text but has no timestamps.

    Your workflow is as follows:
    1. **Alignment**: Use the "timestamped rough transcript" as the base framework and intelligently fill in the accurate text from the "accurate transcript" into the corresponding segments. Note that the sentences and words may not perfectly match, but most of them do, especially the first and last kana in each segment. These are crucial references for alignment. You must align them carefully using context and semantics to ensure the text flows logically, and most importantly, the accurate text matches the timestamps.
    2. **Translation**: Translate the aligned accurate Japanese text into smooth, natural English.
    3. **Output**: You must return the result as a JSON array without any extra explanation. Each JSON object must contain the following fields: 'start', 'end', 'original_text' (aligned accurate Japanese), and 'translated_text' (translated English).
    """

    user_prompt = f"""
    Please process the following data:

    --- Timestamped Rough Transcript ---
    {formatted_timings}
    ------------------------------------

    --- Accurate Transcript ---
    {accurate_text}
    --------------------------

    Please strictly follow the instructions and return only the JSON-formatted output.
    """

    print("Calling OpenAI API for processing, please wait...")

    try:
        response = client.chat.completions.create(
            # Recommended to use gpt-4.1 or gpt-4o
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            # Enable JSON mode to ensure correct output format
            response_format={"type": "json_object"}
        )

        # The JSON-formatted reply text lives in choices[0].message.content.
        response_content = response.choices[0].message.content
        response_json = json.loads(response_content)

        # Bug fix: test for a bare top-level array BEFORE calling .items().
        # In the original order, a list response raised AttributeError on
        # .items(), was swallowed by the broad except below, and the
        # isinstance(list) branch was unreachable dead code.
        if isinstance(response_json, list):
            return response_json

        # JSON mode usually wraps the array under some key, e.g.
        # {"subtitles": [...]} — return the first list value found.
        for value in response_json.values():
            if isinstance(value, list):
                return value

        print("Error: Could not find subtitle list in API response.")
        return None

    except Exception as e:
        # Broad on purpose: network, API, and JSON parsing failures all end
        # in the same "no result" outcome for the caller.
        print(f"Error occurred while calling OpenAI API: {e}")
        return None

def _format_srt_time(seconds):
    """Convert a float second offset to SRT timestamp form, e.g. 00:01:02,345.

    Milliseconds are truncated (not rounded), matching the original behavior.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    milliseconds = int((secs - int(secs)) * 1000)
    return f"{int(hours):02}:{int(minutes):02}:{int(secs):02},{milliseconds:03}"


def to_srt(data):
    """Convert processed data into SRT subtitle format string.

    Each item must provide 'start', 'end', 'original_text' and
    'translated_text'. Fixes vs. original: the time formatter was a nested
    def re-created on every loop iteration (hoisted to _format_srt_time),
    and the output is now assembled with join instead of quadratic +=.
    """
    entries = []
    for index, item in enumerate(data, 1):
        start_srt = _format_srt_time(item['start'])
        end_srt = _format_srt_time(item['end'])

        # Both texts are read so a missing key fails fast, as before;
        # only the translation is emitted by default.
        original = item['original_text']
        translated = item['translated_text']

        entries.append(f"{index}\n{start_srt} --> {end_srt}\n{translated}\n\n")
        # For bilingual subtitles, use instead:
        # entries.append(f"{index}\n{start_srt} --> {end_srt}\n{translated}\n{original}\n\n")

    return "".join(entries)
def load_data(json_path, accurate_text_path):
    """Load the JSON timestamp data and the accurate text file.

    Chinese-message variant; defined after (and therefore shadowing) the
    English-message load_data earlier in this file. Returns
    (timing_data, accurate_text), or (None, None) when a file is missing
    or the JSON cannot be parsed.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            timing_data = json.load(f)
    except FileNotFoundError:
        # Error message (Chinese): JSON file not found.
        print(f"错误: JSON文件未找到 at {json_path}")
        return None, None
    except json.JSONDecodeError:
        # Error message (Chinese): invalid JSON file format.
        print(f"错误: JSON文件格式不正确 at {json_path}")
        return None, None

    try:
        with open(accurate_text_path, 'r', encoding='utf-8') as f:
            accurate_text = f.read()
    except FileNotFoundError:
        # Error message (Chinese): accurate text file not found.
        print(f"错误: 精确文本文件未找到 at {accurate_text_path}")
        return None, None
        
    return timing_data, accurate_text

def format_timing_data_for_prompt(timing_data):
    """Format the JSON data into a string that is easier for an LLM to read.

    One line per segment. Note: the final .strip() also removes the two
    leading spaces from the first line, so the first bullet is flush left.
    """
    formatted_string = ""
    for segment in timing_data.get("segments", []):
        start = segment.get("start")
        end = segment.get("end")
        text = segment.get("text")
        # Labels are intentionally Chinese (时间 = time, 文本 = text) to match
        # the Chinese prompt this string is embedded into.
        formatted_string += f"  - 时间: {start:.3f} - {end:.3f}, 文本: \"{text}\"\n"
    return formatted_string.strip()

def align_and_translate(formatted_timings, accurate_text):
    """
    Use the OpenAI API to align and translate the text (Chinese-output
    variant: Japanese source -> Simplified Chinese subtitles).

    Returns the parsed subtitle list (each item has 'start', 'end',
    'original_text', 'translated_text'), or None on any failure.
    """
    # Prompts are intentionally in Chinese; kept byte-for-byte.
    system_prompt = """
    你是一个专业的字幕制作和翻译专家。你的任务是根据两份日文转录本,完成一个精确的、带时间戳的中文翻译。
    一份是“带时间戳的不精确转录本”,它提供了准确的时间信息但文本内容可能有误。
    另一份是“精确转录本”,它提供了准确的文本内容但没有时间信息。

    你的工作流程如下:
    1.  **对齐**:以“带时间戳的不精确转录本”为基础框架,将“精确转录本”中的文本内容,智能地填充到对应的时间段(segment)中。注意,两者的句子和词语不一定完全匹配,但是大部分是match的,尤其是一个segment里的第一个假名和最后一个假名,这是你匹配的重要参考,千万不能错配,你需要根据语义和上下文进行最佳的对齐,确保最终的文本流畅且符合逻辑,最重要的是,精细文本与粗文本的时间戳是一致的。
    2.  **翻译**:将对齐好的、精确的日文文本内容,逐段翻译成流畅、自然的简体中文。
    3.  **输出**:必须以一个JSON数组的格式返回结果,不包含任何额外的解释。每个JSON对象包含以下字段:'start', 'end', 'original_text' (对齐后的精确日文), 'translated_text' (翻译后的中文)。
    """

    user_prompt = f"""
    请处理以下数据:

    --- 带时间戳的不精确转录本 ---
    {formatted_timings}
    ---------------------------------

    --- 精确转录本 ---
    {accurate_text}
    -------------------

    请严格按照指示,完成对齐和翻译,并仅返回JSON格式的输出。
    """

    # Status message (Chinese): calling the OpenAI API, please wait.
    print("正在调用OpenAI API进行处理,请稍候...")

    try:
        response = client.chat.completions.create(
            # Recommended: gpt-4.1 or gpt-4o
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            # JSON mode keeps the reply machine-parseable
            response_format={"type": "json_object"}
        )

        # The JSON-formatted reply text lives in choices[0].message.content.
        response_content = response.choices[0].message.content
        response_json = json.loads(response_content)

        # Bug fix: test for a bare top-level array BEFORE calling .items().
        # In the original order, a list response raised AttributeError on
        # .items(), was swallowed by the broad except below, and the
        # isinstance(list) branch was unreachable dead code.
        if isinstance(response_json, list):
            return response_json

        # JSON mode usually wraps the array under some key, e.g.
        # {"subtitles": [...]} — return the first list value found.
        for value in response_json.values():
            if isinstance(value, list):
                return value

        # Error message (Chinese): subtitle list not found in API response.
        print("错误:无法在API响应中找到字幕列表。")
        return None

    except Exception as e:
        # Broad on purpose: network, API, and JSON parsing failures all end
        # in the same "no result" outcome for the caller.
        print(f"调用OpenAI API时发生错误: {e}")
        return None

def to_srt(data):
    """Convert processed data into an SRT-format subtitle string.

    Each item must provide 'start', 'end', 'original_text' and
    'translated_text'. Fix vs. original: format_time was defined inside the
    loop body and so re-created on every iteration — it is now defined once
    per call; output assembly uses join instead of quadratic +=.
    """

    def format_time(s):
        """Format seconds as SRT time, e.g. 00:01:02,345 (ms truncated)."""
        hours, remainder = divmod(s, 3600)
        minutes, seconds = divmod(remainder, 60)
        milliseconds = int((seconds - int(seconds)) * 1000)
        return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}"

    blocks = []
    for i, item in enumerate(data, 1):
        start_srt = format_time(item['start'])
        end_srt = format_time(item['end'])

        # Both texts are read so a missing key fails fast, as before;
        # only the translation is emitted by default.
        original = item['original_text']
        translated = item['translated_text']

        blocks.append(f"{i}\n{start_srt} --> {end_srt}\n{translated}\n\n")
        # For bilingual (translation + original) subtitles, use instead:
        # blocks.append(f"{i}\n{start_srt} --> {end_srt}\n{translated}\n{original}\n\n")

    return "".join(blocks)

Sign up or log in to comment