Windows 环境下 llama.cpp 编译 + Qwen 模型本地部署全指南

梭净挟 发表于 2026-1-10 23:05:04

在大模型落地场景中，本地轻量化部署因低延迟、高隐私性、无需依赖云端算力等优势，成为开发者与 AI 爱好者的热门需求。本文聚焦 Windows 10/11（64 位）环境，详细拆解 llama.cpp 工具的编译流程（支持 CPU/GPU 双模式，GPU 加速需依赖 NVIDIA CUDA），并指导如何通过 modelscope 下载 GGUF 格式的 Qwen-7B-Chat 模型，最终实现模型本地启动与 API 服务搭建。
1.打开管理员权限的 PowerShell/CMD，执行以下命令克隆代码，并进入源码目录创建构建目录：
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
mkdir build
cd build
2.基础编译（仅 CPU 支持）或者选用GPU 加速编译（已安装 CUDA Toolkit）
如果只使用CPU则执行如下配置

cmake .. -G "Visual Studio 18 2026" -A x64 -DLLAMA_CURL=OFF
cmake --build . --config Release

如果已安装 CUDA Toolkit，添加 -DGGML_CUDA=ON 开启 GPU 支持（旧的 -DLLAMA_CUDA=ON 写法已被弃用）

cmake .. -G "Visual Studio 18 2026" -A x64 -DGGML_CUDA=ON
cmake --build . --config Release
3、下载 GGUF 格式的 Qwen 模型（以 7B 为例）
https://www.modelscope.cn/models

pip install modelscope
modelscope download --model Xorbits/Qwen-7B-Chat-GGUF
下载后的默认保存位置为用户目录下的 .cache\modelscope\hub\models\Xorbits（即 %USERPROFILE%\.cache\modelscope\hub\models\Xorbits）
4、运行模型启动 API 服务（支持 HTTP 调用）
# 命令行启动
chcp 65001
llama-cli.exe -m qwen.gguf -i -c 4096

# CPU 版
llama-server.exe -m qwen.gguf --host 127.0.0.1 --port 11433 -c 4096

# GPU 加速版
llama-server.exe -m qwen-7b-chat.Q4_0.gguf -c 4096 --n-gpu-layers -1
5、服务启动后默认监听 http://localhost:8080（如果像上面的 CPU 版命令那样指定了 --port 11433，请把下文地址中的端口相应改为 11433），可通过 curl 测试调用效果。
curl http://localhost:8080/completion -H "Content-Type: application/json" -d '{
"prompt": "你好，介绍一下通义千问",
"temperature": 0.7,
"max_tokens": 512
}'
6、工具测试，通过代码调用大模型测试效果。
基础非流式调用（completion 端点）
import requests
import json

# Non-streaming request against the server's /completion endpoint.
url = "http://localhost:8080/completion"
headers = {"Content-Type": "application/json"}
data = {
    "model": "qwen.gguf",
    "prompt": "你好，请用100字介绍一下通义千问",
    "temperature": 0.7,  # sampling randomness (lower is more conservative)
    "max_tokens": 512,  # upper bound on the number of generated tokens
    "ctx_size": 4096,  # context window (meant to match the server's -c value)
    "stop": ["<|im_end|>"],  # stop sequence matching Qwen's chat format
}

try:
    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
    response.raise_for_status()
    result = response.json()

    print("生成结果：")
    print(result["content"])
except Exception as e:
    # Top-level boundary of the demo: report any failure instead of a traceback.
    print(f"调用失败：{e}")
多轮对话示例（基于 chat/completions）
import requests
import json

# Conversation so far; chat_with_model appends the user and assistant messages to it.
chat_history = []
# chat/completions endpoint of the local server.
url = "http://localhost:8080/chat/completions"
# Header sent with every request; the body is posted as JSON text.
headers = {"Content-Type": "application/json"}

def chat_with_model(prompt):
    """Send one user turn to the chat endpoint and return the assistant's reply.

    The prompt is appended to the module-level ``chat_history`` before the
    request and the reply is appended after a successful response, so every
    call posts the whole conversation so far.

    Args:
        prompt: Text of the user's message.

    Returns:
        The assistant's reply text, or a string starting with "调用失败：" when
        the request or the response parsing raises.
    """
    # Record the user's message so it is part of the request below.
    chat_history.append({"role": "user", "content": prompt})

    data = {
        "model": "qwen.gguf",
        "messages": chat_history,
        "temperature": 0.7,
        "max_tokens": 512,
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
        response.raise_for_status()
        result = response.json()
        # "choices" is a list; the reply lives in its first element.
        answer = result["choices"][0]["message"]["content"]
    except Exception as e:
        return f"调用失败：{e}"

    # Record the assistant's reply so the next request carries it.
    chat_history.append({"role": "assistant", "content": answer})
    return answer

# Interactive multi-turn demo: keep reading input until the user types "退出".
print("开始多轮对话（输入'退出'结束）：")
while True:
    user_input = input("你：")
    if user_input == "退出":
        break
    answer = chat_with_model(user_input)
    print(f"助手：{answer}\n")
带有对话记忆功能测试
import requests
import json
import re

# Initial conversation history (contains a system prompt that steers the model to keep context)
chat_history = [
{"role": "system", "content": "你是一个有帮助的助手，必须记住之前的对话内容，基于上下文回答用户问题。"}
]
# Address of the running service (port 11433 and the chat/completions path)
url = "http://localhost:11433/chat/completions"
headers = {"Content-Type": "application/json"}

def clean_pad_content(content):
    """Remove padding-token placeholders from model output and trim whitespace.

    NOTE(review): the pattern in the published snippet was lost (its bracketed
    text was stripped by the page, leaving an unterminated string literal).
    It is reconstructed here as ``[PAD]`` / ``[PAD<digits>]`` placeholders —
    confirm against the garbage text your model actually emits.

    Args:
        content: Raw text returned by the model.

    Returns:
        The text with every placeholder removed and surrounding whitespace
        stripped.
    """
    return re.sub(r'\[PAD\d*\]', '', content).strip()

def chat_with_model(prompt):
    """Send one user turn with the full history and return the cleaned reply.

    Appends the user message to the module-level ``chat_history``, posts the
    whole history to ``url`` and, on success, appends the cleaned assistant
    reply to the history as well.

    Args:
        prompt: Text of the user's message.

    Returns:
        The assistant's reply after ``clean_pad_content``, or a descriptive
        error string when the request fails or the response lacks the
        ``choices[0].message.content`` fields.
    """
    # The context is carried by this list: every request resends all of it.
    chat_history.append({"role": "user", "content": prompt})

    data = {
        "model": "qwen.gguf",
        "messages": chat_history,
        "temperature": 0.7,
        "max_tokens": 512,
        "stream": False,  # ask for one JSON body instead of a stream
        # NOTE(review): the stop string was lost in the published snippet;
        # "[PAD" is a reconstruction intended to end padding output early.
        "stop": ["[PAD"],
    }

    # Bound up front so the generic handler can tell whether a body was received.
    response = None
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
        response.raise_for_status()  # turn HTTP errors (e.g. 404, 500) into exceptions

        result = response.json()
        # Optional: show the beginning of the raw response for debugging.
        print(f"调试：模型原始返回 = {json.dumps(result, ensure_ascii=False)[:500]}")

        if "choices" not in result or len(result["choices"]) == 0:
            return f"返回格式异常：缺少 choices 字段，原始返回：{json.dumps(result, ensure_ascii=False)[:300]}"

        # "choices" is a list; the reply lives in its first element.
        choice = result["choices"][0]
        if "message" not in choice or "content" not in choice["message"]:
            return f"返回格式异常：缺少 message/content 字段，原始返回：{json.dumps(result, ensure_ascii=False)[:300]}"

        answer = clean_pad_content(choice["message"]["content"])

        # Store the reply so the next request includes it.
        chat_history.append({"role": "assistant", "content": answer})
        return answer

    except requests.exceptions.ConnectionError:
        return "连接失败：请检查本地服务是否在 11433 端口运行"
    except requests.exceptions.Timeout:
        return "请求超时：模型响应过慢"
    except Exception as e:
        raw_text = response.text[:300] if response is not None else '无'
        return f"调用失败：{str(e)}，原始返回：{raw_text}"

# Multi-turn test loop (focused on whether the model remembers earlier turns).
print("开始多轮对话（输入'退出'结束）：")
print("提示：先发送 '我的名字是李四'，再发送 '我叫什么名字' 测试记忆功能\n")
while True:
    user_input = input("你：")
    if user_input.strip() == "退出":
        break
    if not user_input.strip():
        # Blank input is rejected without calling the model.
        print("助手：请输入有效内容！\n")
        continue
    answer = chat_with_model(user_input)
    print(f"助手：{answer}\n")
函数工具调用测试
import requests
import json
import re
from datetime import datetime

# ====================== 1. 定义可用工具集 ======================
# 工具1：获取当前时间
def get_current_time():
    """Return the current local time in a sentence, formatted YYYY-MM-DD HH:MM:SS."""
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return f"当前时间为：{current_time}"

# 工具2：加法计算
def calculate_add(a: float, b: float):
    """Return a string showing the sum of *a* and *b* as ``"a + b = result"``."""
    return f"{a} + {b} = {a + b}"

# Tool registry: maps each tool name to its function, a description and
# a parameter spec, so tool calls can be looked up by name.
tool_registry = {
    "get_current_time": {
        "function": get_current_time,
        "description": "获取当前的本地时间，无需参数",
        "parameters": {},  # takes no parameters
    },
    "calculate_add": {
        "function": calculate_add,
        "description": "计算两个数字的加法，需要两个参数：a（数字）、b（数字）",
        "parameters": {
            "a": {"type": "float", "required": True, "description": "加数1"},
            "b": {"type": "float", "required": True, "description": "加数2"},
        },
    },
}

# ====================== 2. Initialise conversation history and basic config ======================
chat_history = [
{"role": "system", "content": """你是一个有帮助的助手，必须记住之前的对话内容，基于上下文回答用户问题。
你可以调用以下工具来辅助回答：
1. get_current_time：获取当前的本地时间，无需参数
2. calculate_add：计算两个数字的加法，需要参数a和b（均为数字）

如果需要调用工具，请严格按照以下JSON格式返回（仅返回JSON，不要加其他内容）：
{"name": "工具名", "parameters": {"参数名": 参数值}}

如果不需要调用工具，直接回答用户问题即可，不要返回JSON格式。"""}
]

# Address of the local LLM service
url = "http://localhost:11433/chat/completions"
headers = {"Content-Type": "application/json"}

# ====================== 3. 工具调用相关辅助函数 ======================
def clean_pad_content(content):
    """Remove padding-token placeholders from model output and trim whitespace.

    NOTE(review): the pattern in the published snippet was lost (its bracketed
    text was stripped by the page, leaving an unterminated string literal).
    It is reconstructed here as ``[PAD]`` / ``[PAD<digits>]`` placeholders —
    confirm against the garbage text your model actually emits.

    Args:
        content: Raw text returned by the model.

    Returns:
        The text with every placeholder removed and surrounding whitespace
        stripped.
    """
    return re.sub(r'\[PAD\d*\]', '', content).strip()

def parse_tool_call(content):
    """Extract a tool-call instruction (a JSON object) from the model's reply.

    The span from the first ``{`` to the last ``}`` is parsed as JSON, which
    tolerates extra text before or after the object.

    Args:
        content: Text returned by the model.

    Returns:
        The parsed dict when it contains both "name" and "parameters";
        otherwise ``None`` (no braces, invalid JSON, missing keys, or a
        non-string *content*).
    """
    try:
        # Greedy match: everything between the first "{" and the last "}".
        json_match = re.search(r'\{[\s\S]*\}', content)
        if not json_match:
            return None
        tool_call = json.loads(json_match.group())
    except (json.JSONDecodeError, TypeError):
        # Invalid JSON, or content was not a string: treat as "no tool call".
        return None

    # A usable tool call must carry both fields.
    if "name" in tool_call and "parameters" in tool_call:
        return tool_call
    return None

def execute_tool(tool_call):
    """Run the tool named in *tool_call* and return a result or error string.

    Looks the tool up in the module-level ``tool_registry``, checks that all
    required parameters are present, converts parameters declared as "float"
    or "int", then calls the tool function with them as keyword arguments.

    Args:
        tool_call: Dict with a "name" key and an optional "parameters" dict.

    Returns:
        "工具调用成功（<name>）：<result>" on success, otherwise a string starting
        with "错误：" describing what went wrong.
    """
    tool_name = tool_call["name"]

    # Reject unknown tools before touching the parameters.
    if tool_name not in tool_registry:
        return f"错误：不存在名为 {tool_name} 的工具，可用工具：{list(tool_registry.keys())}"

    raw_parameters = tool_call.get("parameters") or {}  # JSON null means "no parameters"
    if not isinstance(raw_parameters, dict):
        return f"错误：调用 {tool_name} 的参数必须是 JSON 对象"
    # Work on a copy so the conversions below do not mutate the caller's dict.
    parameters = dict(raw_parameters)

    tool_info = tool_registry[tool_name]
    tool_func = tool_info["function"]
    tool_params = tool_info["parameters"]

    # Required parameters that the call did not supply.
    missing_params = [
        param_name
        for param_name, param_info in tool_params.items()
        if param_info.get("required") and param_name not in parameters
    ]
    if missing_params:
        return f"错误：调用 {tool_name} 缺少必填参数：{', '.join(missing_params)}"

    # Convert declared numeric parameters (e.g. a numeric string to a number).
    converters = {"float": float, "int": int}
    try:
        for param_name, param_info in tool_params.items():
            convert = converters.get(param_info.get("type", "str"))
            if param_name in parameters and convert is not None:
                parameters[param_name] = convert(parameters[param_name])
    except (TypeError, ValueError) as e:
        # TypeError covers values such as None or lists that float()/int() reject.
        return f"错误：参数类型转换失败 - {str(e)}"

    try:
        result = tool_func(**parameters)
    except Exception as e:
        return f"错误：执行 {tool_name} 失败 - {str(e)}"
    return f"工具调用成功（{tool_name}）：{result}"

# ====================== 4. 核心对话函数（支持工具调用） ======================
def chat_with_model(prompt):
global chat_history

# 添加当前用户消息到历史
chat_history.append({"role": "user", "content": prompt})

# 第一步：发送请求，判断是否需要调用工具
data = {
 "model": "qwen.gguf",
 "messages": chat_history,
 "temperature": 0.7,
 "max_tokens": 512,
 "stream": False,
 "stop": ["
}

try:
 # 第一次调用模型：获取是否需要工具调用的响应
 response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
 response.raise_for_status()
 result = response.json()

 # 解析模型原始返回
 if "choices" in result and len(result["choices"]) > 0 and "message" in result["choices"]:
 raw_answer = result["choices"]["message"]["content"]
 clean_answer = clean_pad_content(raw_answer)
 else:
 return f"返回格式异常：{json.dumps(result, ensure_ascii=False)[:300]}"

 # 解析是否包含工具调用指令
 tool_call = parse_tool_call(clean_answer)
 if tool_call:
 print(f" 来源：程序园用户自行投稿发布，如果侵权，请联系站长删除 免责声明：如果侵犯了您的权益，请联系站长，我们会及时删除侵权内容，谢谢合作！

供挂发表于 2026-1-14 09:23:10

谢谢分享，辛苦了

芮梦月 发表于 2026-1-17 02:35:30

yyds。多谢分享

骆贵发表于 2026-1-18 23:20:46

感谢发布原创作品，程序园因你更精彩

褐洌发表于 2026-1-21 09:23:04

感谢分享

颜清华 发表于 2026-1-24 10:18:42

很好很强大我过来先占个楼待编辑

供挂发表于 2026-1-25 03:25:13

东西不错很实用谢谢分享

啪炽发表于 2026-1-25 10:19:44

感谢分享，下载保存了，貌似很强大

睁扼妤 发表于 2026-1-26 09:35:01

感谢分享，下载保存了，貌似很强大

旱由发表于 2026-1-26 09:39:20

收藏一下不知道什么时候能用到

雨角发表于 2026-1-27 08:31:56

很好很强大我过来先占个楼待编辑

顾星发表于 2026-1-28 02:32:01

yyds。多谢分享

磁呃泵 发表于 2026-1-30 02:40:03

这个好，看起来很实用

鞣谘坡 发表于 2026-2-3 05:38:53

东西不错很实用谢谢分享

福清婉 发表于 2026-2-3 07:38:35

过来提前占个楼

杜优瑗 发表于 2026-2-8 14:03:58

前排留名，哈哈哈

皇甫佳文 发表于 2026-2-8 23:11:07

谢谢楼主提供！

何书艺 发表于 2026-2-9 23:15:01

过来提前占个楼

狙兕发表于 2026-2-10 04:48:54

懂技术并乐意积极无私分享的人越来越少。珍惜

米榜饴 发表于 2026-2-10 10:59:03

前排留名，哈哈哈

页: [1] 2

程序园's Archiver

Windows 环境下 llama.cpp 编译 + Qwen 模型本地部署全指南