# 大语言模型工程化实践:从提示设计到能力评估的完整体系
在大语言模型应用日益普及的今天,如何系统化地设计提示词并科学评估模型能力成为关键挑战。本文将深入探讨从监督微调(SFT)到模型上下文协议(MCP)的完整技术栈,构建可工程化的LLM应用体系。
## 系统化提示工程框架
构建可维护、可测试的提示工程体系是LLM应用工程化的基础。
```python
# prompt_engineering/framework.py
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from enum import Enum
import json
class PromptRole(Enum):
    """Chat-message roles used when assembling model conversations."""

    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"


@dataclass
class Message:
    """One chat message: a role, its text content, and optional extra data."""

    role: PromptRole
    content: str
    metadata: Optional[Dict[str, Any]] = None
class PromptTemplate:
    """A reusable prompt template with named ``{variable}`` placeholders."""

    def __init__(self, name: str, template: str, variables: List[str]):
        self.name = name
        self.template = template
        self.variables = variables  # placeholder names render() must receive
        self.version = "1.0"

    def render(self, **kwargs) -> str:
        """Substitute every declared placeholder and return the final text.

        Raises:
            ValueError: if any declared variable is missing from kwargs.
        """
        missing_vars = set(self.variables) - set(kwargs.keys())
        if missing_vars:
            raise ValueError(f"缺少必要变量: {missing_vars}")
        content = self.template
        # Plain text substitution: each "{var}" occurrence is replaced in turn.
        for var, value in kwargs.items():
            content = content.replace(f"{{{var}}}", str(value))
        return content

    def validate_inputs(self, inputs: Dict[str, Any]) -> bool:
        """Return True when every declared variable is present in `inputs`."""
        return all(var in inputs for var in self.variables)
class ConversationBuilder:
    """Fluent builder that accumulates chat messages in order."""

    def __init__(self):
        self.messages: List[Message] = []

    def _push_formatted(self, role: PromptRole, content: str,
                        kwargs: Dict[str, Any]) -> 'ConversationBuilder':
        # str.format is applied only when substitutions were supplied, so
        # literal braces in plain content stay untouched.
        if kwargs:
            content = content.format(**kwargs)
        self.messages.append(Message(role, content))
        return self

    def add_system_message(self, content: str, **kwargs) -> 'ConversationBuilder':
        """Append a system message (optionally str.format-ed with kwargs)."""
        return self._push_formatted(PromptRole.SYSTEM, content, kwargs)

    def add_user_message(self, content: str, **kwargs) -> 'ConversationBuilder':
        """Append a user message (optionally str.format-ed with kwargs)."""
        return self._push_formatted(PromptRole.USER, content, kwargs)

    def add_assistant_message(self, content: str) -> 'ConversationBuilder':
        """Append an assistant message verbatim."""
        self.messages.append(Message(PromptRole.ASSISTANT, content))
        return self

    def build(self) -> List[Dict[str, str]]:
        """Serialize to the role/content dict list expected by chat APIs."""
        return [{"role": m.role.value, "content": m.content}
                for m in self.messages]

    def get_message_sequence(self) -> List[Message]:
        """Return a shallow copy of the accumulated Message list."""
        return self.messages.copy()
# Predefined template library
class PromptTemplateLibrary:
    """Registry of built-in PromptTemplate instances, addressed by name."""

    def __init__(self):
        self.templates = {}
        self._initialize_templates()

    def _initialize_templates(self):
        """Create and register the built-in templates."""
        # Text-analysis template
        self.templates["text_analysis"] = PromptTemplate(
            name="text_analysis",
            template="""请分析以下文本:
{text}
分析要求:
1. 提取关键信息
2. 识别主要观点
3. 评估文本质量
4. 提供改进建议
请以JSON格式返回分析结果。""",
            variables=["text"]
        )
        # Code-generation template
        self.templates["code_generation"] = PromptTemplate(
            name="code_generation",
            template="""请根据以下需求生成{language}代码:
需求:{requirement}
要求:
- 包含适当的注释
- 处理边界条件
- 遵循{language}最佳实践
- 包含简单的使用示例
只返回代码,不要解释。""",
            variables=["language", "requirement"]
        )
        # Multi-step reasoning template
        self.templates["complex_reasoning"] = PromptTemplate(
            name="complex_reasoning",
            template="""请基于以下信息进行推理:
背景信息:{context}
问题:{question}
推理步骤:
1. 分析已知信息
2. 识别信息缺口
3. 进行逻辑推理
4. 得出结论并说明置信度
请逐步展示推理过程。""",
            variables=["context", "question"]
        )

    def get_template(self, name: str) -> PromptTemplate:
        """Look up a template by name.

        Raises:
            ValueError: when no template with that name is registered.
        """
        if name not in self.templates:
            raise ValueError(f"模板不存在: {name}")
        return self.templates[name]
```
## 监督微调(SFT)工程化实践
SFT是将领域知识注入模型的关键技术,需要系统化的数据处理和训练流程。
```python
# sft/training_pipeline.py
import torch
from transformers import Trainer, TrainingArguments
from typing import List, Dict, Any
import datasets
from dataclasses import dataclass
@dataclass
class SFTExample:
    """One supervised fine-tuning sample in instruction/input/output form."""

    instruction: str      # task description shown to the model
    input: str            # optional context; may be an empty string
    output: str           # target response the model should learn
    source: str           # provenance tag of the sample
    quality_score: float  # data-quality rating carried with the sample
class SFTDataProcessor:
    """Turns SFTExample records into tokenized, loss-masked training features."""

    def __init__(self, tokenizer, max_length: int = 2048):
        self.tokenizer = tokenizer
        self.max_length = max_length  # truncation length in tokens

    def format_conversation(self, example: SFTExample) -> str:
        """Render an example in Alpaca-style Instruction/Input/Response format."""
        if example.input:
            return f"### Instruction:\n{example.instruction}\n\n### Input:\n{example.input}\n\n### Response:\n{example.output}"
        else:
            return f"### Instruction:\n{example.instruction}\n\n### Response:\n{example.output}"

    def tokenize_function(self, examples: Dict[str, List]) -> Dict[str, List]:
        """Tokenize a batched dataset dict and attach loss labels.

        Labels are -100 (ignored by the loss) for everything up to and
        including the "### Response:" marker, so only response tokens
        contribute to the training loss.
        """
        # Build the prompt text for every row in the batch.
        prompts = []
        for i in range(len(examples["instruction"])):
            example = SFTExample(
                instruction=examples["instruction"][i],
                input=examples["input"][i],
                output=examples["output"][i],
                source=examples["source"][i],
                quality_score=examples["quality_score"][i]
            )
            prompts.append(self.format_conversation(example))
        # Tokenize without padding; a data collator is expected to pad later.
        tokenized = self.tokenizer(
            prompts,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors=None
        )
        # Mask the instruction part out of the loss.
        labels = []
        for i in range(len(tokenized["input_ids"])):
            input_ids = tokenized["input_ids"][i]
            # Locate the "### Response:" token span.
            # NOTE(review): the [1:] slice assumes encode() prepends a single
            # BOS token — confirm for the tokenizer actually used here.
            response_token = self.tokenizer.encode("### Response:")[1:]
            response_start = self.find_sublist(input_ids, response_token)
            if response_start != -1:
                # Compute loss on the response tokens only.
                label = [-100] * len(input_ids)
                for j in range(response_start + len(response_token), len(input_ids)):
                    label[j] = input_ids[j]
            else:
                # Marker not found (e.g. truncated away): fall back to
                # full-sequence loss rather than dropping the sample.
                label = input_ids.copy()
            labels.append(label)
        tokenized["labels"] = labels
        return tokenized

    def find_sublist(self, main_list: List, sublist: List) -> int:
        """Return the first index of `sublist` inside `main_list`, or -1."""
        sublen = len(sublist)
        for i in range(len(main_list) - sublen + 1):
            if main_list[i:i+sublen] == sublist:
                return i
        return -1
class SFTTrainer:
    """Thin wrapper around transformers.Trainer for supervised fine-tuning."""

    def __init__(self, model, tokenizer, training_args: Dict[str, Any]):
        self.model = model
        self.tokenizer = tokenizer
        self.training_args = training_args  # plain dict of hyperparameters
        self.data_processor = SFTDataProcessor(tokenizer)

    def prepare_dataset(self, examples: List[SFTExample]) -> datasets.Dataset:
        """Convert SFTExample objects into a tokenized HuggingFace Dataset."""
        data_dict = {
            "instruction": [ex.instruction for ex in examples],
            "input": [ex.input for ex in examples],
            "output": [ex.output for ex in examples],
            "source": [ex.source for ex in examples],
            "quality_score": [ex.quality_score for ex in examples]
        }
        dataset = datasets.Dataset.from_dict(data_dict)
        tokenized_dataset = dataset.map(
            self.data_processor.tokenize_function,
            batched=True,
            remove_columns=dataset.column_names
        )
        return tokenized_dataset

    def train(self, train_examples: List[SFTExample],
              eval_examples: List[SFTExample] = None):
        """Run training; returns the fitted transformers.Trainer instance."""
        train_dataset = self.prepare_dataset(train_examples)
        eval_dataset = self.prepare_dataset(eval_examples) if eval_examples else None
        training_args = TrainingArguments(
            output_dir=self.training_args["output_dir"],
            num_train_epochs=self.training_args.get("num_train_epochs", 3),
            per_device_train_batch_size=self.training_args.get("batch_size", 4),
            gradient_accumulation_steps=self.training_args.get("gradient_accumulation_steps", 1),
            learning_rate=self.training_args.get("learning_rate", 2e-5),
            warmup_steps=self.training_args.get("warmup_steps", 100),
            logging_dir=self.training_args.get("logging_dir", "./logs"),
            logging_steps=self.training_args.get("logging_steps", 10),
            # Evaluate periodically only when an eval set was supplied.
            evaluation_strategy="steps" if eval_dataset else "no",
            eval_steps=self.training_args.get("eval_steps", 100),
            save_steps=self.training_args.get("save_steps", 500),
            load_best_model_at_end=bool(eval_dataset),
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
        )
        trainer.train()
        return trainer
```
## RAG系统评估体系
构建科学的RAG评估指标,确保检索和生成质量。
```python
# evaluation/rag_metrics.py
from typing import List, Dict, Any, Tuple
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from rouge_score import rouge_scorer
class RAGEvaluator:
    """Computes retrieval and generation quality metrics for RAG pipelines."""

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def evaluate_retrieval_quality(self, retrieved_docs: List[str],
                                   relevant_docs: List[str]) -> Dict[str, float]:
        """Set-based precision/recall/F1 over normalized document texts."""
        if not retrieved_docs or not relevant_docs:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
        # Exact-match comparison after normalization; near-duplicates count as misses.
        retrieved_set = set(self.normalize_text(doc) for doc in retrieved_docs)
        relevant_set = set(self.normalize_text(doc) for doc in relevant_docs)
        true_positives = len(retrieved_set & relevant_set)
        false_positives = len(retrieved_set - relevant_set)
        false_negatives = len(relevant_set - retrieved_set)
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "retrieved_count": len(retrieved_docs),
            "relevant_count": len(relevant_docs)
        }

    def evaluate_answer_quality(self, generated_answer: str,
                                reference_answer: str,
                                context: List[str] = None) -> Dict[str, float]:
        """ROUGE scores plus optional factual-consistency and relevance metrics."""
        rouge_scores = self.rouge_scorer.score(reference_answer, generated_answer)
        metrics = {
            "rouge1": rouge_scores['rouge1'].fmeasure,
            "rouge2": rouge_scores['rouge2'].fmeasure,
            "rougeL": rouge_scores['rougeL'].fmeasure,
        }
        # Factual consistency is only computable when retrieval context is given.
        if context:
            metrics["factual_consistency"] = self.calculate_factual_consistency(generated_answer, context)
        metrics["answer_relevance"] = self.calculate_answer_relevance(generated_answer, reference_answer)
        return metrics

    def calculate_factual_consistency(self, answer: str, context: List[str]) -> float:
        """Fraction of answer sentences literally contained in the context.

        Simplified heuristic — an NLI model would be more robust in production.
        """
        answer_sentences = self.split_into_sentences(answer)
        context_text = " ".join(context)
        consistent_count = 0
        for sentence in answer_sentences:
            if self.check_sentence_in_context(sentence, context_text):
                consistent_count += 1
        return consistent_count / len(answer_sentences) if answer_sentences else 0.0

    def calculate_answer_relevance(self, generated: str, reference: str) -> float:
        """TF-IDF cosine similarity used as a proxy for answer relevance."""
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity
        vectorizer = TfidfVectorizer().fit([generated, reference])
        vectors = vectorizer.transform([generated, reference])
        return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

    def normalize_text(self, text: str) -> str:
        """Lowercase, strip punctuation, and collapse runs of whitespace."""
        import re
        text = text.lower().strip()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def split_into_sentences(self, text: str) -> List[str]:
        """Naive sentence split on runs of ., !, ?; drops empty fragments."""
        import re
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    def check_sentence_in_context(self, sentence: str, context: str) -> bool:
        """True when the normalized sentence is a substring of the normalized context."""
        normalized_sentence = self.normalize_text(sentence)
        normalized_context = self.normalize_text(context)
        return normalized_sentence in normalized_context
class BenchmarkSuite:
    """Runs stored test cases against a RAG system and aggregates the metrics."""

    def __init__(self):
        self.evaluator = RAGEvaluator()
        self.test_cases = []

    def add_test_case(self, query: str, reference_answer: str,
                      relevant_docs: List[str], context: Dict[str, Any] = None):
        """Store one benchmark case; `context` defaults to an empty dict."""
        self.test_cases.append({
            "query": query,
            "reference_answer": reference_answer,
            "relevant_docs": relevant_docs,
            "context": context or {}
        })

    def run_benchmark(self, rag_system) -> Dict[str, Any]:
        """Evaluate every stored case; returns per-case and averaged metrics.

        `rag_system` must expose process_query(query, context) returning a dict
        with "retrieved_documents" and "generated_answer" keys.
        """
        results = []
        for test_case in self.test_cases:
            system_result = rag_system.process_query(
                test_case["query"],
                test_case.get("context", {})
            )
            # Retrieval quality against the labeled relevant documents.
            retrieval_metrics = self.evaluator.evaluate_retrieval_quality(
                system_result.get("retrieved_documents", []),
                test_case["relevant_docs"]
            )
            # Generation quality against the reference answer.
            generation_metrics = self.evaluator.evaluate_answer_quality(
                system_result.get("generated_answer", ""),
                test_case["reference_answer"],
                system_result.get("retrieved_documents", [])
            )
            results.append({
                "query": test_case["query"],
                "retrieval_metrics": retrieval_metrics,
                "generation_metrics": generation_metrics,
                "system_result": system_result
            })
        avg_retrieval = self.calculate_average_metrics([r["retrieval_metrics"] for r in results])
        avg_generation = self.calculate_average_metrics([r["generation_metrics"] for r in results])
        return {
            "detailed_results": results,
            "average_metrics": {
                "retrieval": avg_retrieval,
                "generation": avg_generation
            },
            "summary": self.generate_summary(avg_retrieval, avg_generation)
        }

    def calculate_average_metrics(self, metrics_list: List[Dict]) -> Dict[str, float]:
        """Average each numeric metric across all dicts; empty input -> {}."""
        if not metrics_list:
            return {}
        avg_metrics = {}
        # Metric keys are taken from the first result; only numeric values averaged.
        for key in metrics_list[0].keys():
            if isinstance(metrics_list[0][key], (int, float)):
                values = [m[key] for m in metrics_list if key in m]
                avg_metrics[key] = sum(values) / len(values) if values else 0.0
        return avg_metrics

    def generate_summary(self, retrieval_metrics: Dict, generation_metrics: Dict) -> str:
        """Map averaged retrieval F1 / ROUGE-L onto a coarse verdict string."""
        retrieval_f1 = retrieval_metrics.get('f1', 0)
        generation_rougeL = generation_metrics.get('rougeL', 0)
        if retrieval_f1 > 0.8 and generation_rougeL > 0.7:
            return "优秀:检索和生成质量都很高"
        elif retrieval_f1 > 0.6 and generation_rougeL > 0.5:
            return "良好:系统表现稳定"
        else:
            return "需要改进:系统性能有待提升"
```
## Function Calling与MCP集成
实现标准化的工具调用和协议集成。
```python
# function_calling/integration.py
from typing import Dict, List, Any, Callable, get_type_hints
import inspect
import json
from mcp import MCPServer
class FunctionCallingEngine:
    """Registry that exposes Python callables as OpenAI-style tool functions."""

    def __init__(self):
        self.functions = {}         # name -> callable
        self.function_schemas = {}  # name -> OpenAI-compatible schema dict

    def register_function(self, func: Callable, description: str = None) -> str:
        """Register `func` and derive its JSON schema; returns the function name."""
        func_name = func.__name__
        self.functions[func_name] = func
        # Build an OpenAI-compatible function schema from the signature.
        schema = self._generate_function_schema(func, description)
        self.function_schemas[func_name] = schema
        return func_name

    def _generate_function_schema(self, func: Callable, description: str) -> Dict[str, Any]:
        """Build a function-calling schema from the signature and type hints."""
        signature = inspect.signature(func)
        type_hints = get_type_hints(func)
        parameters = {}
        required = []
        for param_name, param in signature.parameters.items():
            # *args / **kwargs cannot be described as named JSON properties.
            if param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD):
                continue
            param_type = type_hints.get(param_name, str)
            parameters[param_name] = {
                "type": self._python_type_to_json_type(param_type),
                "description": f"参数 {param_name}"
            }
            # `is` comparison: Parameter.empty is a sentinel object.
            if param.default is inspect.Parameter.empty:
                required.append(param_name)
        return {
            "name": func.__name__,
            "description": description or func.__doc__ or "",
            "parameters": {
                "type": "object",
                "properties": parameters,
                "required": required
            }
        }

    def _python_type_to_json_type(self, py_type: type) -> str:
        """Map a Python annotation to a JSON-schema type string (default: string)."""
        type_mapping = {
            str: "string",
            int: "integer",
            float: "number",
            bool: "boolean",
            list: "array",
            dict: "object"
        }
        return type_mapping.get(py_type, "string")

    async def execute_function_call(self, function_name: str, arguments: Dict[str, Any]) -> Any:
        """Validate arguments and invoke the registered function (sync or async).

        Raises:
            ValueError: unknown function name.
            RuntimeError: validation or execution failed (original cause chained).
        """
        if function_name not in self.functions:
            raise ValueError(f"函数未注册: {function_name}")
        func = self.functions[function_name]
        try:
            self._validate_arguments(func, arguments)
            # Await coroutine functions; call plain functions directly.
            if inspect.iscoroutinefunction(func):
                result = await func(**arguments)
            else:
                result = func(**arguments)
            return result
        except Exception as e:
            raise RuntimeError(f"函数执行失败: {str(e)}") from e

    def _validate_arguments(self, func: Callable, arguments: Dict[str, Any]):
        """Check all required parameters are present and reject unknown names."""
        signature = inspect.signature(func)
        accepts_kwargs = any(
            p.kind is inspect.Parameter.VAR_KEYWORD
            for p in signature.parameters.values()
        )
        for param_name, param in signature.parameters.items():
            if param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD):
                continue
            if param_name not in arguments and param.default is inspect.Parameter.empty:
                raise ValueError(f"缺少必需参数: {param_name}")
        for arg_name in arguments.keys():
            # Unknown names are allowed only when the function takes **kwargs.
            if arg_name not in signature.parameters and not accepts_kwargs:
                raise ValueError(f"未知参数: {arg_name}")
class MCPFunctionBridge(MCPServer):
    """Exposes a FunctionCallingEngine's registry as MCP server tools."""

    def __init__(self, function_engine: FunctionCallingEngine):
        super().__init__()
        self.function_engine = function_engine
        self._register_tools()

    def _register_tools(self):
        """Mirror every registered function schema into the MCP tool table."""
        for func_name, schema in self.function_engine.function_schemas.items():
            self.tools[func_name] = {
                "description": schema["description"],
                "parameters": schema["parameters"]
            }

    async def handle_tool_call(self, tool_name: str, arguments: Dict[str, Any]) -> str:
        """Run a tool call and serialize the outcome (result or error) as JSON."""
        try:
            outcome = await self.function_engine.execute_function_call(tool_name, arguments)
        except Exception as e:
            return json.dumps({"error": str(e)})
        return json.dumps({"result": outcome})
# Example tool implementations
def search_web(query: str, max_results: int = 5) -> str:
    """Perform a web search (stub implementation returning a canned summary)."""
    return f"搜索 '{query}' 的结果摘要..."
def calculate_expression(expression: str) -> float:
    """Evaluate an arithmetic expression string.

    SECURITY: eval() on externally supplied text is dangerous. Builtins are
    stripped here so plain arithmetic still works while name lookups fail;
    a production system should use a real expression parser instead.

    Raises:
        ValueError: when the expression fails to evaluate.
    """
    try:
        # Empty globals/locals: bare arithmetic works, builtin names do not.
        return eval(expression, {"__builtins__": {}}, {})
    except Exception as e:
        raise ValueError(f"计算错误: {str(e)}")
async def get_weather(location: str) -> Dict[str, Any]:
    """Fetch weather for `location` (stubbed, fixed API payload)."""
    # Simulated API response
    payload = {
        "location": location,
        "temperature": 22.5,
        "condition": "晴朗",
        "humidity": 65
    }
    return payload
# Usage example
def setup_function_calling():
    """Wire up the engine, register the sample tools, and build the MCP bridge."""
    engine = FunctionCallingEngine()
    # Register the example tools with their Chinese descriptions.
    for func, desc in (
        (search_web, "执行网络搜索"),
        (calculate_expression, "计算数学表达式"),
        (get_weather, "获取天气信息"),
    ):
        engine.register_function(func, desc)
    mcp_bridge = MCPFunctionBridge(engine)
    return engine, mcp_bridge
```
通过系统化的提示工程、科学的评估体系和标准化的工具集成,我们可以构建出可靠、可评估、可维护的大语言模型应用。这种工程化方法确保了LLM应用在真实场景中的稳定性和实用性。