{"id":739,"date":"2026-02-12T11:22:42","date_gmt":"2026-02-12T03:22:42","guid":{"rendered":"https:\/\/www.bit.ac.cn\/?p=739"},"modified":"2026-02-12T11:29:12","modified_gmt":"2026-02-12T03:29:12","slug":"qwen3-tts","status":"publish","type":"post","link":"https:\/\/www.bit.ac.cn\/index.php\/2026\/02\/12\/qwen3-tts\/","title":{"rendered":"Qwen3-TTS"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\">\u6a21\u578b\u5217\u8868<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">\u6a21\u578b<\/th><th class=\"has-text-align-left\" data-align=\"left\">\u53c2\u6570\u91cf<\/th><th class=\"has-text-align-left\" data-align=\"left\">\u529f\u80fd\u7279\u6027<\/th><th class=\"has-text-align-left\" data-align=\"left\">\u6d41\u5f0f\u652f\u6301<\/th><th class=\"has-text-align-left\" data-align=\"left\">\u6307\u4ee4\u63a7\u5236<\/th><\/tr><\/thead><tbody><tr><td>Qwen3-TTS-12Hz-1.7B-VoiceDesign<\/td><td>1.7B<\/td><td>\u57fa\u4e8e\u63cf\u8ff0\u751f\u6210\u8bed\u97f3<\/td><td>\u2705<\/td><td>\u2705<\/td><\/tr><tr><td>Qwen3-TTS-12Hz-1.7B-CustomVoice<\/td><td>1.7B<\/td><td>9\u79cd\u7cbe\u9009\u97f3\u8272 + \u6307\u4ee4\u63a7\u5236<\/td><td>\u2705<\/td><td>\u2705<\/td><\/tr><tr><td>Qwen3-TTS-12Hz-1.7B-Base<\/td><td>1.7B<\/td><td>3\u79d2\u5feb\u901f\u514b\u9686 + \u5fae\u8c03<\/td><td>\u2705<\/td><td>&#8211;<\/td><\/tr><tr><td>Qwen3-TTS-12Hz-0.6B-CustomVoice<\/td><td>0.6B<\/td><td>9\u79cd\u7cbe\u9009\u97f3\u8272<\/td><td>\u2705<\/td><td>&#8211;<\/td><\/tr><tr><td>Qwen3-TTS-12Hz-0.6B-Base<\/td><td>0.6B<\/td><td>3\u79d2\u5feb\u901f\u514b\u9686 + \u5fae\u8c03<\/td><td>\u2705<\/td><td>&#8211;<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">\u6027\u80fd\u5bf9\u6bd4<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">\u6a21\u578b<\/th><th class=\"has-text-align-left\" data-align=\"left\">\u4e2d\u6587 WER<\/th><th 
class=\"has-text-align-left\" data-align=\"left\">\u82f1\u6587 WER<\/th><\/tr><\/thead><tbody><tr><td>CosyVoice 3<\/td><td>0.71<\/td><td>1.45<\/td><\/tr><tr><td>MiniMax-Speech<\/td><td>0.83<\/td><td>1.65<\/td><\/tr><tr><td><strong>Qwen3-TTS-12Hz-1.7B-Base<\/strong><\/td><td><strong>0.77<\/strong><\/td><td><strong>1.24<\/strong><\/td><\/tr><tr><td>Qwen3-TTS-12Hz-0.6B-Base<\/td><td>0.92<\/td><td>1.32<\/td><\/tr><tr><td>FireRedTTS 2<\/td><td>1.14<\/td><td>1.95<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">\u73af\u5883\u914d\u7f6e<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code># \u521b\u5efa Python 3.12 \u73af\u5883\nconda create -n qwen3-tts python=3.12 -y\nconda activate qwen3-tts\n\n# \u5b89\u88c5 qwen-tts \u5305\npip install -U qwen-tts\n\n# \u63a8\u8350\u5b89\u88c5 FlashAttention 2 \u4ee5\u51cf\u5c11\u663e\u5b58\u5360\u7528\npip install -U flash-attn --no-build-isolation<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">1. \u81ea\u5b9a\u4e49\u8bed\u97f3\u751f\u6210\uff08CustomVoice\uff09<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code>import torch\nimport soundfile as sf\nfrom qwen_tts import Qwen3TTSModel\n\nmodel = Qwen3TTSModel.from_pretrained(\n    \"Qwen\/Qwen3-TTS-12Hz-1.7B-CustomVoice\",\n    device_map=\"cuda:0\",\n    dtype=torch.bfloat16,\n    attn_implementation=\"flash_attention_2\",\n)\n\n# \u5355\u6761\u63a8\u7406\nwavs, sr = model.generate_custom_voice(\n    text=\"\u5176\u5b9e\u6211\u771f\u7684\u6709\u53d1\u73b0\uff0c\u6211\u662f\u4e00\u4e2a\u7279\u522b\u5584\u4e8e\u89c2\u5bdf\u522b\u4eba\u60c5\u7eea\u7684\u4eba\u3002\",\n    language=\"Chinese\",\n    speaker=\"Vivian\",\n    instruct=\"\u7528\u7279\u522b\u6124\u6012\u7684\u8bed\u6c14\u8bf4\",\n)\nsf.write(\"output_custom_voice.wav\", wavs&#91;0], sr)\n\n# \u6279\u91cf\u63a8\u7406\nwavs, sr = model.generate_custom_voice(\n    text=&#91;\n        
\"\u5176\u5b9e\u6211\u771f\u7684\u6709\u53d1\u73b0\uff0c\u6211\u662f\u4e00\u4e2a\u7279\u522b\u5584\u4e8e\u89c2\u5bdf\u522b\u4eba\u60c5\u7eea\u7684\u4eba\u3002\", \n        \"She said she would be here by noon.\"\n    ],\n    language=&#91;\"Chinese\", \"English\"],\n    speaker=&#91;\"Vivian\", \"Ryan\"],\n    instruct=&#91;\"\", \"Very happy.\"]\n)<\/code><\/pre>\n\n\n\n<p><strong>\u00a0\u7cbe\u9009\u97f3\u8272<\/strong>\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">\u97f3\u8272<\/th><th class=\"has-text-align-left\" data-align=\"left\">\u63cf\u8ff0<\/th><th class=\"has-text-align-left\" data-align=\"left\">\u6bcd\u8bed<\/th><\/tr><\/thead><tbody><tr><td>Vivian<\/td><td>\u660e\u4eae\u3001\u7565\u5e26\u9510\u5229\u7684\u5e74\u8f7b\u5973\u58f0<\/td><td>\u4e2d\u6587<\/td><\/tr><tr><td>Serena<\/td><td>\u6e29\u6696\u3001\u6e29\u67d4\u7684\u5e74\u8f7b\u5973\u58f0<\/td><td>\u4e2d\u6587<\/td><\/tr><tr><td>Uncle_Fu<\/td><td>\u6210\u719f\u7537\u58f0\uff0c\u4f4e\u6c89\u5706\u6da6<\/td><td>\u4e2d\u6587<\/td><\/tr><tr><td>Dylan<\/td><td>\u5e74\u8f7b\u7684\u5317\u4eac\u7537\u58f0\uff0c\u6e05\u6670\u81ea\u7136<\/td><td>\u4e2d\u6587\uff08\u5317\u4eac\u8bdd\uff09<\/td><\/tr><tr><td>Eric<\/td><td>\u6d3b\u6cfc\u7684\u6210\u90fd\u7537\u58f0\uff0c\u7565\u5e26\u6c99\u54d1<\/td><td>\u4e2d\u6587\uff08\u56db\u5ddd\u8bdd\uff09<\/td><\/tr><tr><td>Ryan<\/td><td>\u52a8\u611f\u7537\u58f0\uff0c\u8282\u594f\u611f\u5f3a<\/td><td>\u82f1\u6587<\/td><\/tr><tr><td>Aiden<\/td><td>\u9633\u5149\u7f8e\u5f0f\u7537\u58f0\uff0c\u4e2d\u97f3\u6e05\u6670<\/td><td>\u82f1\u6587<\/td><\/tr><tr><td>Ono_Anna<\/td><td>\u4fcf\u76ae\u65e5\u672c\u5973\u58f0\uff0c\u8f7b\u76c8\u7075\u52a8<\/td><td>\u65e5\u8bed<\/td><\/tr><tr><td>Sohee<\/td><td>\u6e29\u6696\u97e9\u56fd\u5973\u58f0\uff0c\u60c5\u611f\u4e30\u5bcc<\/td><td>\u97e9\u8bed<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">2. 
\u8bed\u97f3\u8bbe\u8ba1\uff08VoiceDesign\uff09<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code>import torch\nimport soundfile as sf\nfrom qwen_tts import Qwen3TTSModel\n\nmodel = Qwen3TTSModel.from_pretrained(\n    \"Qwen\/Qwen3-TTS-12Hz-1.7B-VoiceDesign\",\n    device_map=\"cuda:0\",\n    dtype=torch.bfloat16,\n    attn_implementation=\"flash_attention_2\",\n)\n\nwavs, sr = model.generate_voice_design(\n    text=\"\u54e5\u54e5\uff0c\u4f60\u56de\u6765\u5566\uff0c\u4eba\u5bb6\u7b49\u4e86\u4f60\u597d\u4e45\u597d\u4e45\u4e86\uff01\",\n    language=\"Chinese\",\n    instruct=\"\u4f53\u73b0\u6492\u5a07\u7a1a\u5ae9\u7684\u841d\u8389\u5973\u58f0\uff0c\u97f3\u8c03\u504f\u9ad8\u4e14\u8d77\u4f0f\u660e\u663e\uff0c\u8425\u9020\u51fa\u9ecf\u4eba\u3001\u505a\u4f5c\u53c8\u523b\u610f\u5356\u840c\u7684\u542c\u89c9\u6548\u679c\u3002\",\n)\nsf.write(\"output_voice_design.wav\", wavs&#91;0], sr)<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">3. \u8bed\u97f3\u514b\u9686\uff08Base\uff09<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code>import torch\nimport soundfile as sf\nfrom qwen_tts import Qwen3TTSModel\n\nmodel = Qwen3TTSModel.from_pretrained(\n    \"Qwen\/Qwen3-TTS-12Hz-1.7B-Base\",\n    device_map=\"cuda:0\",\n    dtype=torch.bfloat16,\n    attn_implementation=\"flash_attention_2\",\n)\n\nref_audio = \"https:\/\/qianwen-res.oss-cn-beijing.aliyuncs.com\/Qwen3-TTS-Repo\/clone.wav\"\nref_text = \"Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you.\"\n\nwavs, sr = model.generate_voice_clone(\n    text=\"I am solving the equation: x = &#91;-b \u00b1 \u221a(b\u00b2-4ac)] \/ 2a? Nobody can \u2014 it's a disaster (\u25cd\u2022\u0348\u2314\u2022\u0348\u25cd), very sad!\",\n    language=\"English\",\n    ref_audio=ref_audio,\n    ref_text=ref_text,\n)\nsf.write(\"output_voice_clone.wav\", wavs&#91;0], sr)<\/code><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\">4. 
\u542f\u52a8\u672c\u5730 Web UI<\/h2>\n\n\n\n<pre class=\"wp-block-code\"><code># CustomVoice \u6a21\u578b\nqwen-tts-demo Qwen\/Qwen3-TTS-12Hz-1.7B-CustomVoice --ip 0.0.0.0 --port 8000\n\n# VoiceDesign \u6a21\u578b\nqwen-tts-demo Qwen\/Qwen3-TTS-12Hz-1.7B-VoiceDesign --ip 0.0.0.0 --port 8000\n\n# Base \u6a21\u578b\nqwen-tts-demo Qwen\/Qwen3-TTS-12Hz-1.7B-Base --ip 0.0.0.0 --port 8000<\/code><\/pre>\n\n\n\n<p>\u7136\u540e\u8bbf\u95ee http:\/\/&lt;your-ip&gt;:8000 \u5373\u53ef\u4f53\u9a8c\u3002<\/p>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6a21\u578b\u5217\u8868 \u6a21\u578b \u53c2\u6570\u91cf \u529f\u80fd\u7279\u6027 \u6d41\u5f0f\u652f\u6301 \u6307\u4ee4\u63a7\u5236 Qwen3-TTS-12Hz-1.7B-VoiceDesign 1.7B \u57fa\u4e8e\u63cf\u8ff0\u751f\u6210\u8bed\u97f3 \u2705 \u2705 Qwen3-TTS-12Hz-1.7B-CustomVoice 1.7B 9\u79cd\u7cbe\u9009\u97f3\u8272&#8230;<\/p>\n","protected":false},"author":1,"featured_media":68,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[32],"tags":[],"class_list":["post-739","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-github"],"_links":{"self":[{"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/posts\/739","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/comments?post=739"}],"version-history":[{"count":6,"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/posts\/739\/revisions"}],"predecessor-version":[{"id":747,"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/posts\/739\/revisions\/747"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/
www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/media\/68"}],"wp:attachment":[{"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/media?parent=739"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/categories?post=739"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.bit.ac.cn\/index.php\/wp-json\/wp\/v2\/tags?post=739"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}