
# 大语言模型工程化实践:从提示设计到能力评估的完整体系


在大语言模型应用日益普及的今天,如何系统化地设计提示词并科学评估模型能力成为关键挑战。本文将深入探讨从监督微调(SFT)到模型上下文协议(MCP)的完整技术栈,构建可工程化的LLM应用体系。


## 系统化提示工程框架


构建可维护、可测试的提示工程体系是LLM应用工程化的基础。


```python

# prompt_engineering/framework.py

from abc import ABC, abstractmethod

from typing import Dict, List, Any, Optional

from dataclasses import dataclass

from enum import Enum

import json



class PromptRole(Enum):
    """Speaker roles for chat-style prompts; values match the common chat-API message schema."""

    SYSTEM = "system"

    USER = "user"

    ASSISTANT = "assistant"


@dataclass

class Message:
    """A single chat message: a role plus its text content."""

    role: PromptRole

    content: str

    # Optional free-form annotations (only role/content are serialized by ConversationBuilder.build).
    metadata: Optional[Dict[str, Any]] = None


class PromptTemplate:
    """A reusable, versioned prompt template with named ``{variable}`` placeholders.

    Fix: removed injected spam-link lines that sat inside the class body and
    made the file syntactically invalid.
    """

    def __init__(self, name: str, template: str, variables: List[str]):
        """
        Args:
            name: Unique template identifier.
            template: Text containing ``{var}`` placeholders.
            variables: Placeholder names that must be supplied to render().
        """
        self.name = name
        self.template = template
        self.variables = variables
        self.version = "1.0"

    def render(self, **kwargs) -> str:
        """Fill the placeholders and return the rendered prompt.

        Raises:
            ValueError: if any declared variable is missing from kwargs.
        """
        missing_vars = set(self.variables) - set(kwargs.keys())
        if missing_vars:
            raise ValueError(f"缺少必要变量: {missing_vars}")

        # Plain substring replacement (not str.format) so braces inside the
        # substituted *values* are never re-interpreted as placeholders.
        content = self.template
        for var, value in kwargs.items():
            content = content.replace(f"{{{var}}}", str(value))
        return content

    def validate_inputs(self, inputs: Dict[str, Any]) -> bool:
        """Return True when every declared variable is present in ``inputs``."""
        return all(var in inputs for var in self.variables)


class ConversationBuilder:
    """Fluent builder that assembles an ordered chat transcript message by message."""

    def __init__(self):
        self.messages: List[Message] = []

    def add_system_message(self, content: str, **kwargs) -> 'ConversationBuilder':
        """Append a system message; ``kwargs``, when given, are str.format arguments."""
        text = content.format(**kwargs) if kwargs else content
        self.messages.append(Message(PromptRole.SYSTEM, text))
        return self

    def add_user_message(self, content: str, **kwargs) -> 'ConversationBuilder':
        """Append a user message; ``kwargs``, when given, are str.format arguments."""
        text = content.format(**kwargs) if kwargs else content
        self.messages.append(Message(PromptRole.USER, text))
        return self

    def add_assistant_message(self, content: str) -> 'ConversationBuilder':
        """Append an assistant message verbatim (no formatting pass)."""
        self.messages.append(Message(PromptRole.ASSISTANT, content))
        return self

    def build(self) -> List[Dict[str, str]]:
        """Serialize to the role/content dict list expected by chat APIs."""
        serialized = []
        for msg in self.messages:
            serialized.append({"role": msg.role.value, "content": msg.content})
        return serialized

    def get_message_sequence(self) -> List[Message]:
        """Return a shallow copy of the accumulated messages."""
        return list(self.messages)


# 预定义模板库

class PromptTemplateLibrary:
    """Registry of built-in prompt templates, keyed by template name.

    Fix: removed injected spam-link lines that broke the class body and that
    were embedded inside the ``complex_reasoning`` template string.
    """

    def __init__(self):
        self.templates = {}
        self._initialize_templates()

    def _initialize_templates(self):
        """Register the built-in analysis / code-generation / reasoning templates."""
        # 分析类模板
        self.templates["text_analysis"] = PromptTemplate(
            name="text_analysis",
            template="""请分析以下文本:

{text}

分析要求:
1. 提取关键信息
2. 识别主要观点
3. 评估文本质量
4. 提供改进建议

请以JSON格式返回分析结果。""",
            variables=["text"]
        )

        # 代码生成模板
        self.templates["code_generation"] = PromptTemplate(
            name="code_generation",
            template="""请根据以下需求生成{language}代码:

需求:{requirement}

要求:
- 包含适当的注释
- 处理边界条件
- 遵循{language}最佳实践
- 包含简单的使用示例

只返回代码,不要解释。""",
            variables=["language", "requirement"]
        )

        # 推理类模板
        self.templates["complex_reasoning"] = PromptTemplate(
            name="complex_reasoning",
            template="""请基于以下信息进行推理:

背景信息:{context}
问题:{question}

推理步骤:
1. 分析已知信息
2. 识别信息缺口
3. 进行逻辑推理
4. 得出结论并说明置信度

请逐步展示推理过程。""",
            variables=["context", "question"]
        )

    def get_template(self, name: str) -> PromptTemplate:
        """Return the template registered under ``name``.

        Raises:
            ValueError: if no template with that name exists.
        """
        if name not in self.templates:
            raise ValueError(f"模板不存在: {name}")
        return self.templates[name]

```


## 监督微调(SFT)工程化实践


SFT是将领域知识注入模型的关键技术,需要系统化的数据处理和训练流程。


```python

# sft/training_pipeline.py

import torch


from transformers import Trainer, TrainingArguments

from typing import List, Dict, Any

import datasets

from dataclasses import dataclass


@dataclass

class SFTExample:
    """One supervised fine-tuning (SFT) record in Alpaca instruction/input/output style."""

    # Task instruction shown to the model.
    instruction: str

    # Optional extra context; may be empty (the Input section is then omitted, see SFTDataProcessor).
    input: str

    # Target response the model is trained to produce.
    output: str

    # Provenance of the sample (dataset/collection name).
    source: str

    # Data-quality rating; carried through tokenization but not consumed in this file — presumably used for filtering/weighting upstream, TODO confirm.
    quality_score: float


class SFTDataProcessor:
    """Turns SFTExample records into tokenized, loss-masked training features.

    Fix: removed injected spam-link lines that sat inside ``tokenize_function``
    and made the file syntactically invalid; hoisted the loop-invariant
    "### Response:" encoding out of the per-example loop.
    """

    def __init__(self, tokenizer, max_length: int = 2048):
        """
        Args:
            tokenizer: HuggingFace-style tokenizer (callable, with .encode()).
            max_length: Truncation length in tokens.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length

    def format_conversation(self, example: SFTExample) -> str:
        """Render one example in the Alpaca prompt format; the Input section is omitted when empty."""
        if example.input:
            return f"### Instruction:\n{example.instruction}\n\n### Input:\n{example.input}\n\n### Response:\n{example.output}"
        else:
            return f"### Instruction:\n{example.instruction}\n\n### Response:\n{example.output}"

    def tokenize_function(self, examples: Dict[str, List]) -> Dict[str, List]:
        """Batch-tokenize a column-oriented batch and build response-only labels.

        Tokens up to and including the "### Response:" marker get label -100 so
        the instruction part is excluded from the loss.
        """
        # Build one prompt string per example in the batch.
        prompts = []
        for i in range(len(examples["instruction"])):
            example = SFTExample(
                instruction=examples["instruction"][i],
                input=examples["input"][i],
                output=examples["output"][i],
                source=examples["source"][i],
                quality_score=examples["quality_score"][i]
            )
            prompts.append(self.format_conversation(example))

        tokenized = self.tokenizer(
            prompts,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors=None
        )

        # NOTE(review): [1:] assumes encode() prepends exactly one special (BOS)
        # token — confirm for the tokenizer actually in use.
        response_token = self.tokenizer.encode("### Response:")[1:]

        # Mask everything except the response tokens.
        labels = []
        for input_ids in tokenized["input_ids"]:
            response_start = self.find_sublist(input_ids, response_token)
            if response_start != -1:
                label = [-100] * len(input_ids)
                for j in range(response_start + len(response_token), len(input_ids)):
                    label[j] = input_ids[j]
            else:
                # Marker not found (e.g. truncated away): fall back to full supervision.
                label = input_ids.copy()
            labels.append(label)

        tokenized["labels"] = labels
        return tokenized

    def find_sublist(self, main_list: List, sublist: List) -> int:
        """Return the start index of the first occurrence of ``sublist`` in ``main_list``, or -1."""
        sublen = len(sublist)
        for i in range(len(main_list) - sublen + 1):
            if main_list[i:i + sublen] == sublist:
                return i
        return -1


class SFTTrainer:
    """Thin orchestration layer over the HuggingFace Trainer for SFT runs."""

    def __init__(self, model, tokenizer, training_args: Dict[str, Any]):
        self.model = model
        self.tokenizer = tokenizer
        self.training_args = training_args
        self.data_processor = SFTDataProcessor(tokenizer)

    def prepare_dataset(self, examples: List[SFTExample]) -> datasets.Dataset:
        """Convert SFTExample objects into a tokenized HF dataset."""
        columns = ("instruction", "input", "output", "source", "quality_score")
        data_dict = {col: [getattr(ex, col) for ex in examples] for col in columns}

        dataset = datasets.Dataset.from_dict(data_dict)
        return dataset.map(
            self.data_processor.tokenize_function,
            batched=True,
            remove_columns=dataset.column_names
        )

    def train(self, train_examples: List[SFTExample],
              eval_examples: List[SFTExample] = None):
        """Fine-tune the model; evaluation is enabled only when eval examples are given."""
        train_dataset = self.prepare_dataset(train_examples)
        eval_dataset = self.prepare_dataset(eval_examples) if eval_examples else None

        cfg = self.training_args
        hf_args = TrainingArguments(
            output_dir=cfg["output_dir"],
            num_train_epochs=cfg.get("num_train_epochs", 3),
            per_device_train_batch_size=cfg.get("batch_size", 4),
            gradient_accumulation_steps=cfg.get("gradient_accumulation_steps", 1),
            learning_rate=cfg.get("learning_rate", 2e-5),
            warmup_steps=cfg.get("warmup_steps", 100),
            logging_dir=cfg.get("logging_dir", "./logs"),
            logging_steps=cfg.get("logging_steps", 10),
            evaluation_strategy="steps" if eval_dataset else "no",
            eval_steps=cfg.get("eval_steps", 100),
            save_steps=cfg.get("save_steps", 500),
            load_best_model_at_end=True if eval_dataset else False,
        )

        trainer = Trainer(
            model=self.model,
            args=hf_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
        )
        trainer.train()
        return trainer


```


## RAG系统评估体系


构建科学的RAG评估指标,确保检索和生成质量。


```python

# evaluation/rag_metrics.py

from typing import List, Dict, Any, Tuple

import numpy as np
# Fix: precision_recall_fscore_support lives in sklearn.metrics; the previous
# submodule-style import path did not exist and raised ImportError.
from sklearn.metrics import precision_recall_fscore_support
from rouge_score import rouge_scorer


class RAGEvaluator:
    """Metrics for RAG systems: retrieval precision/recall/F1 and answer quality.

    Fix: removed injected spam-link lines that sat inside the return-dict
    literal of ``evaluate_retrieval_quality`` and broke the syntax.
    """

    def __init__(self):
        # ROUGE-1/2/L with stemming, built once and reused across evaluations.
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def evaluate_retrieval_quality(self, retrieved_docs: List[str],
                                 relevant_docs: List[str]) -> Dict[str, float]:
        """Precision/recall/F1 of retrieval via exact match on normalized text.

        Returns zeroed metrics when either list is empty.
        """
        if not retrieved_docs or not relevant_docs:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

        # Set-based exact matching after normalization; duplicates collapse.
        retrieved_set = set(self.normalize_text(doc) for doc in retrieved_docs)
        relevant_set = set(self.normalize_text(doc) for doc in relevant_docs)

        true_positives = len(retrieved_set & relevant_set)
        false_positives = len(retrieved_set - relevant_set)
        false_negatives = len(relevant_set - retrieved_set)

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "retrieved_count": len(retrieved_docs),
            "relevant_count": len(relevant_docs)
        }

    def evaluate_answer_quality(self, generated_answer: str,
                              reference_answer: str,
                              context: List[str] = None) -> Dict[str, float]:
        """Score a generated answer: ROUGE vs. the reference, plus optional
        factual consistency against the retrieval context and TF-IDF relevance."""
        # ROUGE F-measures against the reference answer.
        rouge_scores = self.rouge_scorer.score(reference_answer, generated_answer)

        metrics = {
            "rouge1": rouge_scores['rouge1'].fmeasure,
            "rouge2": rouge_scores['rouge2'].fmeasure,
            "rougeL": rouge_scores['rougeL'].fmeasure,
        }

        # Factual consistency only when retrieval context is supplied.
        if context:
            factual_consistency = self.calculate_factual_consistency(generated_answer, context)
            metrics["factual_consistency"] = factual_consistency

        # Relevance proxy: TF-IDF cosine similarity to the reference.
        answer_relevance = self.calculate_answer_relevance(generated_answer, reference_answer)
        metrics["answer_relevance"] = answer_relevance

        return metrics

    def calculate_factual_consistency(self, answer: str, context: List[str]) -> float:
        """Fraction of answer sentences found verbatim (after normalization) in the context.

        Simplified heuristic — a production system would use an NLI model here.
        """
        answer_sentences = self.split_into_sentences(answer)
        context_text = " ".join(context)

        consistent_count = 0
        for sentence in answer_sentences:
            if self.check_sentence_in_context(sentence, context_text):
                consistent_count += 1

        return consistent_count / len(answer_sentences) if answer_sentences else 0.0

    def calculate_answer_relevance(self, generated: str, reference: str) -> float:
        """TF-IDF cosine similarity between generated and reference answers."""
        # Local import keeps sklearn optional until this metric is requested.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        vectorizer = TfidfVectorizer().fit([generated, reference])
        vectors = vectorizer.transform([generated, reference])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

        return similarity

    def normalize_text(self, text: str) -> str:
        """Lowercase, strip punctuation, and collapse whitespace."""
        import re
        text = text.lower().strip()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def split_into_sentences(self, text: str) -> List[str]:
        """Naive sentence split on ./!/? runs; empty fragments are dropped."""
        import re
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    def check_sentence_in_context(self, sentence: str, context: str) -> bool:
        """True when the normalized sentence appears as a substring of the normalized context."""
        normalized_sentence = self.normalize_text(sentence)
        normalized_context = self.normalize_text(context)
        return normalized_sentence in normalized_context


class BenchmarkSuite:
    """End-to-end RAG benchmark: runs stored test cases through a system and averages the metrics.

    Fix: removed injected spam-link lines that sat between the ``class`` line
    and its docstring, breaking the syntax.
    """

    def __init__(self):
        self.evaluator = RAGEvaluator()
        self.test_cases = []

    def add_test_case(self, query: str, reference_answer: str,
                     relevant_docs: List[str], context: Dict[str, Any] = None):
        """Store one benchmark case; ``context`` defaults to an empty dict."""
        self.test_cases.append({
            "query": query,
            "reference_answer": reference_answer,
            "relevant_docs": relevant_docs,
            "context": context or {}
        })

    def run_benchmark(self, rag_system) -> Dict[str, Any]:
        """Run every test case through ``rag_system`` and aggregate the scores.

        ``rag_system`` must expose ``process_query(query, context)`` returning a
        dict with "retrieved_documents" and "generated_answer" keys.
        """
        results = []

        for test_case in self.test_cases:
            # Ask the system under test to answer the query.
            system_result = rag_system.process_query(
                test_case["query"],
                test_case.get("context", {})
            )

            # Score retrieval against the labeled relevant docs.
            retrieval_metrics = self.evaluator.evaluate_retrieval_quality(
                system_result.get("retrieved_documents", []),
                test_case["relevant_docs"]
            )

            # Score the generated answer against the reference, using the
            # retrieved docs as factual-consistency context.
            generation_metrics = self.evaluator.evaluate_answer_quality(
                system_result.get("generated_answer", ""),
                test_case["reference_answer"],
                system_result.get("retrieved_documents", [])
            )

            results.append({
                "query": test_case["query"],
                "retrieval_metrics": retrieval_metrics,
                "generation_metrics": generation_metrics,
                "system_result": system_result
            })

        # Aggregate per-case metrics into averages.
        avg_retrieval = self.calculate_average_metrics([r["retrieval_metrics"] for r in results])
        avg_generation = self.calculate_average_metrics([r["generation_metrics"] for r in results])

        return {
            "detailed_results": results,
            "average_metrics": {
                "retrieval": avg_retrieval,
                "generation": avg_generation
            },
            "summary": self.generate_summary(avg_retrieval, avg_generation)
        }

    def calculate_average_metrics(self, metrics_list: List[Dict]) -> Dict[str, float]:
        """Average each numeric key across the dicts; non-numeric keys are skipped."""
        if not metrics_list:
            return {}

        avg_metrics = {}
        # Keys are taken from the first dict; entries missing a key are skipped.
        for key in metrics_list[0].keys():
            if isinstance(metrics_list[0][key], (int, float)):
                values = [m[key] for m in metrics_list if key in m]
                avg_metrics[key] = sum(values) / len(values) if values else 0.0

        return avg_metrics

    def generate_summary(self, retrieval_metrics: Dict, generation_metrics: Dict) -> str:
        """One-line verdict based on retrieval F1 and generation ROUGE-L thresholds."""
        retrieval_f1 = retrieval_metrics.get('f1', 0)
        generation_rougeL = generation_metrics.get('rougeL', 0)

        if retrieval_f1 > 0.8 and generation_rougeL > 0.7:
            return "优秀:检索和生成质量都很高"
        elif retrieval_f1 > 0.6 and generation_rougeL > 0.5:
            return "良好:系统表现稳定"
        else:
            return "需要改进:系统性能有待提升"

```


## Function Calling与MCP集成


实现标准化的工具调用和协议集成。


```python

# function_calling/integration.py

from typing import Dict, List, Any, Callable, get_type_hints

import inspect

import json

from mcp import MCPServer


class FunctionCallingEngine:
    """Registry and executor for LLM-callable functions with OpenAI-style schemas.

    Fixes: ``inspect.Parameter.empty`` is a sentinel and is now compared with
    ``is`` (identity) instead of ``==``; execution errors are re-raised with
    exception chaining so the original traceback is preserved.
    """

    def __init__(self):
        self.functions = {}
        self.function_schemas = {}

    def register_function(self, func: Callable, description: str = None) -> str:
        """Register ``func`` and derive its JSON schema; returns the function name."""
        func_name = func.__name__
        self.functions[func_name] = func

        # OpenAI-compatible function schema derived from the signature.
        schema = self._generate_function_schema(func, description)
        self.function_schemas[func_name] = schema

        return func_name

    def _generate_function_schema(self, func: Callable, description: str) -> Dict[str, Any]:
        """Build a JSON-schema description of ``func`` from its signature and type hints."""
        signature = inspect.signature(func)
        type_hints = get_type_hints(func)

        parameters = {}
        required = []

        for param_name, param in signature.parameters.items():
            # Unannotated parameters default to string.
            param_type = type_hints.get(param_name, str)
            param_info = {
                "type": self._python_type_to_json_type(param_type),
                "description": f"参数 {param_name}"
            }

            # Parameter.empty is a sentinel: identity comparison is the correct test.
            if param.default is inspect.Parameter.empty:
                required.append(param_name)

            parameters[param_name] = param_info

        return {
            "name": func.__name__,
            "description": description or func.__doc__ or "",
            "parameters": {
                "type": "object",
                "properties": parameters,
                "required": required
            }
        }

    def _python_type_to_json_type(self, py_type: type) -> str:
        """Map a Python type to its JSON-schema type name; unknown types become "string"."""
        type_mapping = {
            str: "string",
            int: "integer",
            float: "number",
            bool: "boolean",
            list: "array",
            dict: "object"
        }
        return type_mapping.get(py_type, "string")

    async def execute_function_call(self, function_name: str, arguments: Dict[str, Any]) -> Any:
        """Validate arguments and invoke the registered function (sync or async).

        Raises:
            ValueError: if the function name is not registered.
            RuntimeError: if validation or execution fails (original error chained).
        """
        if function_name not in self.functions:
            raise ValueError(f"函数未注册: {function_name}")

        func = self.functions[function_name]

        try:
            self._validate_arguments(func, arguments)

            # Await coroutine functions; call plain functions directly.
            if inspect.iscoroutinefunction(func):
                result = await func(**arguments)
            else:
                result = func(**arguments)

            return result

        except Exception as e:
            # Chain the cause so callers can inspect the underlying failure.
            raise RuntimeError(f"函数执行失败: {str(e)}") from e

    def _validate_arguments(self, func: Callable, arguments: Dict[str, Any]):
        """Check that all required parameters are present and no unknown ones were passed."""
        signature = inspect.signature(func)

        for param_name, param in signature.parameters.items():
            if param_name not in arguments and param.default is inspect.Parameter.empty:
                raise ValueError(f"缺少必需参数: {param_name}")

        for arg_name in arguments.keys():
            if arg_name not in signature.parameters:
                raise ValueError(f"未知参数: {arg_name}")


class MCPFunctionBridge(MCPServer):
    """Exposes a FunctionCallingEngine's registered functions as MCP tools."""

    def __init__(self, function_engine: FunctionCallingEngine):
        super().__init__()
        self.function_engine = function_engine
        self._register_tools()

    def _register_tools(self):
        """Mirror every registered function schema into the MCP tool table."""
        for name, schema in self.function_engine.function_schemas.items():
            self.tools[name] = {
                "description": schema["description"],
                "parameters": schema["parameters"]
            }

    async def handle_tool_call(self, tool_name: str, arguments: Dict[str, Any]) -> str:
        """Execute the named tool and return a JSON envelope with either result or error."""
        try:
            outcome = await self.function_engine.execute_function_call(tool_name, arguments)
            payload = {"result": outcome}
            return json.dumps(payload)
        except Exception as e:
            return json.dumps({"error": str(e)})


# 示例函数定义

def search_web(query: str, max_results: int = 5) -> str:
    """Stub web search: returns a canned result summary for ``query``.

    ``max_results`` is accepted for interface compatibility but unused here.
    """
    summary = f"搜索 '{query}' 的结果摘要..."
    return summary


def calculate_expression(expression: str) -> float:
    """Safely evaluate an arithmetic expression string.

    Security fix: the previous implementation called ``eval()`` on the raw
    string, which executes arbitrary Python — a code-injection hole when the
    expression comes from an LLM or a user.  This version walks a restricted
    AST and only permits numeric literals, binary + - * / // % **, and unary +/-.

    Raises:
        ValueError: if the expression is malformed or uses unsupported syntax.
    """
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def _eval(node):
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_eval(node.operand))
        raise ValueError(f"计算错误: 不支持的表达式节点 {type(node).__name__}")

    try:
        return _eval(ast.parse(expression, mode="eval"))
    except ValueError:
        raise
    except Exception as e:
        raise ValueError(f"计算错误: {str(e)}")


async def get_weather(location: str) -> Dict[str, Any]:
    """Return mock weather data for ``location`` (simulated API call)."""
    report: Dict[str, Any] = {"location": location}
    report.update(temperature=22.5, condition="晴朗", humidity=65)
    return report


# 使用示例

def setup_function_calling():
    """Wire up the demo function-calling stack: engine plus MCP bridge."""
    engine = FunctionCallingEngine()

    # Register the demo tools with human-readable descriptions.
    registrations = (
        (search_web, "执行网络搜索"),
        (calculate_expression, "计算数学表达式"),
        (get_weather, "获取天气信息"),
    )
    for func, description in registrations:
        engine.register_function(func, description)

    bridge = MCPFunctionBridge(engine)

    return engine, bridge

```


通过系统化的提示工程、科学的评估体系和标准化的工具集成,我们可以构建出可靠、可评估、可维护的大语言模型应用。这种工程化方法确保了LLM应用在真实场景中的稳定性和实用性。

