1. Model Preparation
from modelscope import snapshot_download

model_dir = snapshot_download('moonshotai/Kimi-Audio-7B-Instruct', cache_dir="./models")
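snapshot_download returns the local directory it downloaded into, so a quick sanity check confirms the weights landed where the later inference script expects them. A minimal sketch (the exact file names depend on the repository contents, so treat the listing as informational only):

import os

# model_dir is the path returned by snapshot_download above
print("Model downloaded to:", model_dir)

# List the files to confirm the checkpoint and config are present
for name in sorted(os.listdir(model_dir)):
    print(" ", name)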
2. Installation and Initial Inference
When cloning the repository, make sure all of its submodules are cloned as well:
git clone https://github.com/MoonshotAI/Kimi-Audio.git
git submodule update --init --recursive

or clone with submodules in a single step:

git clone --recursive https://github.com/MoonshotAI/Kimi-Audio.git
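The repository's own README is the authoritative source for environment setup (it also documents a Docker option). Assuming a standard requirements.txt is shipped with the repo (an assumption, not confirmed here), dependency installation looks roughly like:

cd Kimi-Audio
# Install Python dependencies (assumes the repo ships a requirements.txt;
# follow the README / Docker instructions if it does not)
pip install -r requirements.txt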
After installing the dependencies, test the setup with python infer.py:
import soundfile as sf
# Assuming the KimiAudio class is available after installation
from kimia_infer.api.kimia import KimiAudio
import torch  # Ensure torch is imported for device placement

# --- 1. Load Model ---
model_path = "/root/xx/models/moonshotai/Kimi-Audio-7B-Instruct"  # IMPORTANT: Update this path if loading locally
model = KimiAudio(model_path=model_path, load_detokenizer=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("Loaded model from local path.")

# --- 2. Define Sampling Parameters ---
sampling_params = {
    "audio_temperature": 0.8,
    "audio_top_k": 10,
    "text_temperature": 0.0,
    "text_top_k": 5,
    "audio_repetition_penalty": 1.0,
    "audio_repetition_window_size": 64,
    "text_repetition_penalty": 1.0,
    "text_repetition_window_size": 16,
}

# --- 3. Example 1: Audio-to-Text (ASR) ---
# TODO: Provide actual example audio files or URLs accessible to users
# E.g., download sample files first or use URLs
# wget https://path/to/your/asr_example.wav -O asr_example.wav
# wget https://path/to/your/qa_example.wav -O qa_example.wav
asr_audio_path = "test_audios/asr_example.wav"  # IMPORTANT: Make sure this file exists
qa_audio_path = "test_audios/qa_example.wav"    # IMPORTANT: Make sure this file exists

messages_asr = [
    {"role": "user", "message_type": "text", "content": "Please transcribe the following audio:"},
    {"role": "user", "message_type": "audio", "content": asr_audio_path},
]

# Generate only text output
# Note: Ensure the model object and generate method accept device placement if needed
_, text_output = model.generate(messages_asr, **sampling_params, output_type="text")
print(">>> ASR Output Text: ", text_output)
# Expected output: "这并不是告别,这是一个篇章的结束,也是新篇章的开始。" (Example)

# --- 4. Example 2: Audio-to-Audio/Text Conversation ---
messages_conversation = [
    {"role": "user", "message_type": "audio", "content": qa_audio_path},
]

# Generate both audio and text output
wav_output, text_output = model.generate(messages_conversation, **sampling_params, output_type="both")

# Save the generated audio
output_audio_path = "output_audio.wav"
# Ensure wav_output is on CPU and flattened before saving
sf.write(output_audio_path, wav_output.detach().cpu().view(-1).numpy(), 24000)  # Assuming 24kHz output
print(f">>> Conversational Output Audio saved to: {output_audio_path}")
print(">>> Conversational Output Text: ", text_output)
# Expected output: "A." (Example)

print("Kimi-Audio inference examples complete.")
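Once the single-file examples run, the same messages/generate pattern extends naturally to batch transcription. The sketch below is only an illustrative wrapper: the transcribe_dir helper and the directory name are made up here, and it reuses only the KimiAudio.generate call and message format shown above.

import os

def transcribe_dir(model, sampling_params, audio_dir):
    """Run ASR on every .wav file in audio_dir using the message format above."""
    results = {}
    for fname in sorted(os.listdir(audio_dir)):
        if not fname.endswith(".wav"):
            continue
        messages = [
            {"role": "user", "message_type": "text", "content": "Please transcribe the following audio:"},
            {"role": "user", "message_type": "audio", "content": os.path.join(audio_dir, fname)},
        ]
        # Text-only output, same call signature as the ASR example above
        _, text = model.generate(messages, **sampling_params, output_type="text")
        results[fname] = text
    return results

# Example usage (directory name is hypothetical):
# transcripts = transcribe_dir(model, sampling_params, "test_audios")
# for name, text in transcripts.items():
#     print(name, "->", text)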