LazyGraphRAG is a knowledge-retrieval technique recently released by Microsoft Research as an improved version of GraphRAG. Its core advantage is a "lazy processing" strategy that defers expensive LLM work, sharply cutting deployment and usage costs while preserving answer quality. First, create a Python 3.10 environment:
```bash
# Create an environment with Anaconda
conda create -n lazygraphrag python=3.10
conda activate lazygraphrag

# Or create one with pyenv
pyenv install 3.10.x
pyenv global 3.10.x
python -m venv lazygraphrag_env
source lazygraphrag_env/bin/activate  # Linux/Mac
# Windows: lazygraphrag_env\Scripts\activate
```
Because LazyGraphRAG ships as part of the GraphRAG package, install GraphRAG first:
```bash
# Clone the GraphRAG repository
git clone https://github.com/microsoft/graphrag.git
cd graphrag

# Install GraphRAG in editable mode
pip install -e .
```
Create a project directory:

```bash
mkdir -p ~/lazygraphrag_project
cd ~/lazygraphrag_project
```
Initialize the GraphRAG workspace:

```bash
graphrag init --root ./graphrag
```
Edit the ./graphrag/settings.yaml file and configure the chat model. The ${GRAPHRAG_API_KEY} placeholder below is resolved from the .env file that `graphrag init` generates; put your model key there:
```yaml
models:
  default_chat_model:
    type: openai_chat # or azure_openai_chat
    api_base: https://ark.cn-beijing.volces.com/api/v3 # I use Volcengine Ark here; other supported providers (e.g. DeepSeek, Gemini) work too
    # api_version: 2024-05-01-preview
    auth_type: api_key # or azure_managed_identity
    api_key: ${GRAPHRAG_API_KEY} # set this in the generated .env file, replacing the placeholder with your model key
    # audience: "https://cognitiveservices.azure.com/.default"
    # organization: <organization_id>
    model: deepseek-v3-250324
    # deployment_name: <azure_model_deployment_name>
    encoding_model: cl100k_base # automatically set by tiktoken if left undefined
    model_supports_json: true # recommended if this is available for your model
    concurrent_requests: 25 # max number of simultaneous LLM requests allowed
    async_mode: threaded # or asyncio
    retry_strategy: native
    max_retries: -1 # set to -1 for dynamic retry logic (most optimal setting based on server response)
    tokens_per_minute: 0 # set to 0 to disable rate limiting
    requests_per_minute: 0 # set to 0 to disable rate limiting
```
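Since a missing or stale key is a common source of indexing failures, it can help to confirm the key from .env is actually visible to Python first. A minimal sketch, assuming python-dotenv is installed and .env sits in the working directory:

```python
# Sanity check: confirm the key from the generated .env file is visible.
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads the .env file generated by `graphrag init`
print("GRAPHRAG_API_KEY set:", bool(os.getenv("GRAPHRAG_API_KEY")))
```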
Then configure the embedding model in the same file:
```yaml
  default_embedding_model:
    type: openai_embedding # or azure_openai_embedding
    api_base: https://ark.cn-beijing.volces.com/api/v3
    # api_version: 2024-05-01-preview
    auth_type: api_key # or azure_managed_identity
    api_key: ${GRAPHRAG_API_KEY}
    # audience: "https://cognitiveservices.azure.com/.default"
    # organization: <organization_id>
    model: doubao-embedding-large-text-240915
    # deployment_name: <azure_model_deployment_name>
    encoding_model: cl100k_base # automatically set by tiktoken if left undefined
    model_supports_json: true # recommended if this is available for your model
    concurrent_requests: 25 # max number of simultaneous LLM requests allowed
    async_mode: threaded # or asyncio
    retry_strategy: native
    max_retries: -1 # set to -1 for dynamic retry logic (most optimal setting based on server response)
    tokens_per_minute: 0 # set to 0 to disable rate limiting
    requests_per_minute: 0 # set to 0 to disable rate limiting
```
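Before running a full index, it can save time to verify that the endpoint and key actually respond. Since the config above uses the OpenAI-compatible `openai_chat`/`openai_embedding` types, a minimal smoke test with the `openai` Python client should work. This is a sketch, not part of GraphRAG; the model names and endpoint are the ones configured in settings.yaml:

```python
# pip install openai
import os
from openai import OpenAI

# Assumes GRAPHRAG_API_KEY is exported or loaded from .env
client = OpenAI(
    base_url="https://ark.cn-beijing.volces.com/api/v3",
    api_key=os.environ["GRAPHRAG_API_KEY"],
)

# Chat model smoke test
chat = client.chat.completions.create(
    model="deepseek-v3-250324",
    messages=[{"role": "user", "content": "ping"}],
)
print(chat.choices[0].message.content)

# Embedding model smoke test
emb = client.embeddings.create(
    model="doubao-embedding-large-text-240915",
    input=["hello world"],
)
print(len(emb.data[0].embedding))
```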
Place the .txt (or other text) files you want to process into the ./graphrag/input directory.
```bash
# Build the index in LazyGraphRAG (fast) mode
graphrag index --root ./graphrag --method fast
```
```bash
# Query with LazyGraphRAG (the example asks for the main contents of the
# National Basic Public Health Service Standards, 3rd edition)
graphrag query --root ./graphrag --method local --query "国家基本公共卫生服务规范(第三版)的主要内容是什么?"
```
GraphRAG can export the generated knowledge graph to Neo4j for visualization:
Edit ./graphrag/settings.yaml and add the Neo4j connection details:
```yaml
neo4j:
  uri: "bolt://localhost:7687"
  username: "neo4j"
  password: "your_password" # replace with your Neo4j password
```
```bash
# Export to Neo4j with the provided script
cd graphrag/utils
python graph_visual_with_neo4j.py
```
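To see what such an export involves, here is a minimal sketch using the official `neo4j` Python driver: it reads the entity and relationship parquet files produced by indexing and merges them into nodes and edges. The file and column names (`entities.parquet`, `title`, `source`, `target`, `weight`) are assumptions based on GraphRAG's default output layout, so adjust them to match your version:

```python
# pip install neo4j pandas pyarrow
import pandas as pd
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "your_password"))

# Assumed default output locations and column names
entities = pd.read_parquet("./graphrag/output/entities.parquet")
relationships = pd.read_parquet("./graphrag/output/relationships.parquet")

with driver.session() as session:
    # Merge each entity as a node
    for _, e in entities.iterrows():
        session.run(
            "MERGE (n:Entity {name: $name}) SET n.description = $desc",
            name=e["title"], desc=e.get("description", ""),
        )
    # Merge each relationship as an edge between existing nodes
    for _, r in relationships.iterrows():
        session.run(
            "MATCH (a:Entity {name: $src}), (b:Entity {name: $dst}) "
            "MERGE (a)-[rel:RELATED]->(b) SET rel.weight = $weight",
            src=r["source"], dst=r["target"], weight=float(r.get("weight", 1.0)),
        )
driver.close()
```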
You can also edit the configuration file to tune LazyGraphRAG's relevance-test budget, trading query cost against answer quality.
GraphRAG 0.5.0 supports incremental index updates, so after adding documents only the new content is indexed:
```bash
graphrag index --root ./graphrag --method fast --incremental
```
Install all remaining dependencies:

```bash
pip install -r requirements.txt
```

To integrate LazyGraphRAG with MCP (Model Context Protocol), you need to expose it as an API service; the FastAPI framework works well:
Install FastAPI and related dependencies:
```bash
pip install fastapi uvicorn pydantic
```
Create the API service (example code):
```python
import logging
import os
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional

import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    filename='graphrag_api.log',
    filemode='a')
logger = logging.getLogger('graphrag_api')

# Try to load a .env file
try:
    from dotenv import load_dotenv
    env_path = Path(os.path.dirname(os.path.abspath(__file__))) / '.env'
    if env_path.exists():
        logger.info(f"Loading environment variables from: {env_path}")
        load_dotenv(dotenv_path=env_path)
    else:
        logger.warning(f".env file not found: {env_path}")
except ImportError:
    logger.warning("python-dotenv is not installed; using system environment variables only")

# GraphRAG project root
GRAPHRAG_ROOT = os.getenv("GRAPHRAG_ROOT", os.path.expanduser("~/graphrag-project/graphrag"))

# Create the FastAPI application
app = FastAPI(
    title="GraphRAG API",
    description="GraphRAG knowledge-base API service",
    version="1.0.0")

# Add CORS middleware (open to all origins; tighten for production)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request model
class QueryRequest(BaseModel):
    query: str = Field(..., description="Query question")
    domain: str = Field("general", description="Query domain")
    method: str = Field("local", description="Query method")

# Response model
class QueryResponse(BaseModel):
    response: str = Field(..., description="Query result")
    sources: Optional[List[Dict[str, Any]]] = Field(None, description="Cited sources")

@app.get("/health", tags=["system"])
async def health_check():
    """Check API health."""
    try:
        # Simply check whether the GRAPHRAG_ROOT directory exists
        graphrag_path = Path(GRAPHRAG_ROOT)
        if not graphrag_path.exists():
            return {
                "status": "warning",
                "message": f"GraphRAG path does not exist: {GRAPHRAG_ROOT}",
                "version": "1.0.0"
            }
        return {
            "status": "ok",
            "version": "1.0.0",
            "graphrag_root": GRAPHRAG_ROOT
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=500, detail=f"Server error: {str(e)}")

@app.post("/query", response_model=QueryResponse, tags=["query"])
async def query(request: QueryRequest):
    """Run a GraphRAG query by shelling out to the CLI."""
    try:
        # Inherit the current environment (API keys etc.)
        env = os.environ.copy()
        # Build the GraphRAG CLI command; `domain` is accepted in the request
        # but not forwarded, since the CLI has no such flag
        cli_cmd = [
            "graphrag", "query",
            "--root", GRAPHRAG_ROOT,
            "--method", request.method,
            "--query", request.query
        ]
        # Run the command
        logger.info(f"Running query: {request.query}")
        process = subprocess.Popen(
            cli_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )
        stdout, stderr = process.communicate()
        # Check the exit status
        if process.returncode != 0:
            logger.error(f"Query failed: {stderr}")
            # Return a generic error response
            return {
                "response": "The query failed to execute.",
                "sources": []
            }
        # Parse the output
        response_text = stdout.strip()
        return {
            "response": response_text,
            "sources": []
        }
    except Exception as e:
        logger.error(f"Query error: {e}")
        # Return a generic error response
        return {
            "response": "The query raised an exception.",
            "sources": []
        }

@app.post("/medical", response_model=QueryResponse, tags=["medical"])
async def medical_query(request: QueryRequest):
    """Domain-specific query for medical knowledge."""
    request.domain = "medical"
    return await query(request)

@app.post("/ml", response_model=QueryResponse, tags=["machine learning"])
async def ml_query(request: QueryRequest):
    """Domain-specific query for machine-learning knowledge."""
    request.domain = "ml"
    return await query(request)

if __name__ == "__main__":
    port = int(os.getenv("GRAPHRAG_API_PORT", "8000"))
    host = os.getenv("GRAPHRAG_API_HOST", "0.0.0.0")
    logger.info(f"Starting GraphRAG API server on {host}:{port}")
    uvicorn.run(app, host=host, port=port)
```
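With the service running on the default port 8000, a quick client-side smoke test might look like this. It is a sketch using the `requests` library; the endpoint and payload mirror the models defined above:

```python
# pip install requests
import requests

# Query the local GraphRAG API service
resp = requests.post(
    "http://localhost:8000/query",
    json={"query": "What does the document cover?", "method": "local"},
    timeout=300,  # CLI-backed queries can be slow
)
resp.raise_for_status()
print(resp.json()["response"])
```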
Create the MCP server (example code):
```python
import logging
import os
from pathlib import Path

from mcp.server.fastmcp import FastMCP

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    filename='graphrag_mcp.log',
    filemode='a')
logger = logging.getLogger('graphrag_mcp')

# Try to load a .env file
try:
    from dotenv import load_dotenv
    env_path = Path(os.path.dirname(os.path.abspath(__file__))) / '.env'
    if env_path.exists():
        logger.info(f"Loading environment variables from: {env_path}")
        load_dotenv(dotenv_path=env_path)
    else:
        logger.warning(f".env file not found: {env_path}")
except ImportError:
    logger.warning("python-dotenv is not installed; using system environment variables only")

# Import the GraphRAG client (a local helper module that wraps the HTTP API
# above; this is not part of the graphrag package itself)
from graphrag.client import GraphRAGClient

# Create the API client
client = GraphRAGClient()

# Create the MCP server
mcp = FastMCP(
    "GraphRAG knowledge base",
    dependencies=["aiohttp"],
    description="MCP server exposing the GraphRAG knowledge base")

@mcp.tool()
async def graphrag_query(query_text: str, domain: str = "general", method: str = "local") -> str:
    """
    Query the knowledge base through GraphRAG.

    Args:
        query_text: the user's question
        domain: query domain, e.g. general, medical, ml
        method: query method, e.g. local (local search) or global (global search)
    """
    response = await client.query(query_text, domain, method)
    logger.info(f"Query result: {response[:100]}..." if len(response) > 100 else f"Query result: {response}")
    return response

@mcp.tool()
async def graphrag_medical(query_text: str) -> str:
    """
    Domain-specific query for medical knowledge.

    Args:
        query_text: a medical-domain question
    """
    response = await client.medical_query(query_text)
    logger.info(f"Medical query result: {response[:100]}..." if len(response) > 100 else f"Medical query result: {response}")
    return response

@mcp.tool()
async def graphrag_ml(query_text: str) -> str:
    """
    Domain-specific query for machine-learning knowledge.

    Args:
        query_text: a machine-learning question
    """
    response = await client.ml_query(query_text)
    logger.info(f"ML query result: {response[:100]}..." if len(response) > 100 else f"ML query result: {response}")
    return response

@mcp.tool()
async def graphrag_health() -> str:
    """Check the health of the GraphRAG API service."""
    status = await client.health_check()
    logger.info(f"Health check result: {status}")
    return f"GraphRAG API service status: {status}"

# Resource endpoint
@mcp.resource("http://localhost:8000/status")
async def get_api_status() -> str:
    """Return GraphRAG API status information."""
    status = await client.health_check()
    return f"""
GraphRAG API status:
URL: {client.api_url}
Status: {status.get('status', 'unknown')}
Version: {status.get('version', 'unknown')}
"""

# Main entry point
if __name__ == "__main__":
    logger.info("Starting GraphRAG MCP server")
    try:
        mcp.run()
    except Exception as e:
        logger.error(f"Server runtime error: {e}", exc_info=True)
```
Register the server in your MCP client configuration (for Claude Desktop, this fragment goes under the `mcpServers` key):
json"graphrag-mcp": {
"command": "./graphrag-mcp/run_server.sh",
"args": []
}
Example run_server.sh:
```bash
#!/bin/bash
# Navigate to the project directory
cd "$(dirname "$0")"

# Prefer the virtual-environment Python if it exists
if [ -f "$(dirname "$0")/lazygraphrag_env/bin/python" ]; then
    "$(dirname "$0")/lazygraphrag_env/bin/python" "$(dirname "$0")/server.py" 2> server.log
else
    # Fall back to the system Python if the venv is missing
    python3 "$(dirname "$0")/server.py" 2> server.log
fi
```
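Remember to make the script executable (`chmod +x run_server.sh`), since the MCP client launches it directly via the `command` field above.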
LazyGraphRAG represents a significant step forward for knowledge-graph retrieval: through its innovative "lazy processing" strategy, it greatly reduces deployment and usage costs while maintaining high-quality query results. This guide covers only the basic installation and usage steps from my own experiments; see the official documentation for more advanced features.