将这段语音输出为文本,将WAV文件按指定的时长进行分割

audio2text.py

# Local checkpoint directory of the Qwen2-Audio-7B-Instruct model.
_MODEL_DIR = "E:/model_cache/QwenQwen2-Audio-7B-Instruct"

# (processor, model) pairs keyed by checkpoint directory, so the weights are
# read from disk only once per process instead of once per audio segment.
_MODEL_CACHE = {}


def _load_model(model_dir=_MODEL_DIR):
    """Return the cached ``(processor, model)`` pair for *model_dir*, loading it on first use."""
    if model_dir not in _MODEL_CACHE:
        processor = AutoProcessor.from_pretrained(model_dir)
        # device_map="auto" already places the weights on the available
        # device(s); an extra model.to(...) afterwards is redundant and may
        # warn or fail for a dispatched model, so it is not called here.
        model = Qwen2AudioForConditionalGeneration.from_pretrained(model_dir, device_map="auto")
        _MODEL_CACHE[model_dir] = (processor, model)
    return _MODEL_CACHE[model_dir]


def audio_to_text(audio_path):
    """
    Transcribe one audio file with the Qwen2-Audio model.

    The model is loaded on the first call and reused afterwards. The
    transcription is printed and returned with two known boilerplate
    prefixes removed.

    :param audio_path: path of the audio file to transcribe
    :return: the cleaned transcription, or "" when the model produced no text
    """
    processor, model = _load_model()

    # Load the audio resampled to 16 kHz.
    audio_data, _ = librosa.load(audio_path, sr=16000)

    # NOTE(review): the instruction is sent in the "assistant" role rather
    # than the "user" role; kept as-is because the prefix clean-up below was
    # tuned against this prompt. Confirm whether this is intentional.
    conversation = [
        {"role": "user", "content": "<|AUDIO|>"},
        {"role": "assistant", "content": "将这段语音输出为文本,直接输出文本内容即可,不要输出多余的话"},
    ]

    # Render the conversation into the prompt string.
    text = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

    # Build the model inputs and move them to the device the model lives on.
    inputs = processor(text=text, audios=[audio_data], return_tensors="pt", padding=True)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # max_new_tokens bounds only the generated part, not prompt + output.
    response_ids = model.generate(**inputs, max_new_tokens=512)

    # generate() returns the prompt tokens followed by the new tokens. Drop
    # the prompt here instead of searching the decoded text for the word
    # "assistant", which breaks as soon as the template wording changes.
    response_ids = response_ids[:, inputs["input_ids"].size(1):]

    response = processor.tokenizer.batch_decode(
        response_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print("--------------------------------------------")

    cleaned_text = response.strip()
    if not cleaned_text:
        print("未找到匹配的内容")
        return ""

    # Remove the lead-in phrases the model tends to put before the transcript.
    for boilerplate in ("这段音频的原始内容是：", "这段语音的原始文本内容是："):
        cleaned_text = cleaned_text.replace(boilerplate, "")
    print(cleaned_text)
    return cleaned_text

split.py

def split_audio(input_wav, segment_length=30, output_dir="output"):
    """
    Split a WAV file into consecutive segments of a fixed duration.

    Segments are written to *output_dir* as ``0.wav``, ``1.wav``, ... in
    playback order; the last segment may be shorter than the others.

    :param input_wav: path of the input WAV file
    :param segment_length: duration of each segment in seconds (default 30);
        fractional values are accepted and rounded to the nearest millisecond
    :param output_dir: directory that receives the segments (default
        "output"); created if it does not exist
    :return: list of the written segment file paths, in order
    :raises ValueError: if *segment_length* rounds to less than one millisecond
    """
    # The audio is sliced in milliseconds and range() needs an integer step,
    # so convert once up front and reject non-positive lengths explicitly
    # (a negative step would otherwise silently produce no output at all).
    segment_ms = int(round(segment_length * 1000))
    if segment_ms <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length!r}")

    audio = AudioSegment.from_wav(input_wav)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    written = []
    # len(audio) is the total duration in milliseconds.
    for count, start in enumerate(range(0, len(audio), segment_ms)):
        segment = audio[start:start + segment_ms]
        output_file = os.path.join(output_dir, f"{count}.wav")
        segment.export(output_file, format="wav")
        print(f"保存: {output_file}")
        written.append(output_file)
    return written

if __name__ == "__main__":
    # Split the recording into numbered segments under E:/voice.
    source_wav = "C:/Users/LMT/Desktop/my.WAV"
    split_audio(source_wav, output_dir="E:/voice")

    # Transcribe 0.wav, 1.wav, ... in order, deleting each segment once it
    # has been processed, and stop at the first missing index.
    pieces = []
    index = 0
    segment_path = f"E:/voice/{index}.wav"
    while os.path.exists(segment_path):
        pieces.append(audio_to_text(segment_path))
        os.remove(segment_path)
        index += 1
        segment_path = f"E:/voice/{index}.wav"

    print("============================")
    print("".join(pieces))

转载请注明：SuperIT » 将这段语音输出为文本,将WAV文件按指定的时长进行分割