This article demonstrates how to use LangChain to run RAG retrieval over the Materials Project data-querying documentation and to answer questions based on the retrieved content.
The walkthrough covers the following steps:
- Retrieve the web page content
- Split the page text into chunks
- Embed the chunks and store them in a vector store
- Test retrieval
- Test generation
- Wrap everything in a simple function
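Before running the code below, the LangChain packages used in this walkthrough need to be installed. A minimal setup sketch, with versions left unpinned; depending on your LangChain version, langchainhub may also be required for hub.pull:

pip install langchain langchain-community langchain-text-splitters langchain-chroma langchain-openai beautifulsoup4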
1. Retrieve the web page content
import bs4
from langchain_community.document_loaders import WebBaseLoader

bs4_strainer = bs4.SoupStrainer(class_="flex flex-row")  # keep only elements with the "flex flex-row" class, where the page's main text lives
loader = WebBaseLoader(  # point the loader at the target page
    web_path="https://docs.materialsproject.org/downloading-data/using-the-api/querying-data",
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()  # fetch and parse the page

print(len(docs[0].page_content))  # check how much text was loaded
print(docs[0].page_content[:500])  # preview the first 500 characters
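The class_="flex flex-row" filter is tied to the current HTML layout of the Materials Project docs site; if the layout changes, the loader can silently return empty text. A small sanity check, an optional addition rather than part of the original walkthrough, catches this early:

if not docs or not docs[0].page_content.strip():
    raise RuntimeError("No text extracted; re-check the SoupStrainer class filter against the page's current HTML")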
2. Split the page text into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # split into chunks of up to 1000 characters
    chunk_overlap=200,     # overlap adjacent chunks by 200 characters
    add_start_index=True,  # record each chunk's start offset in the source document
)
all_splits = text_splitter.split_documents(docs)  # perform the split

print(len(all_splits))  # check how many chunks were produced
print(all_splits[0].page_content)  # inspect the first chunk
print(all_splits[2].metadata)  # metadata carries the source URL and start_index
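The start_index stored by add_start_index makes the chunking parameters easy to verify: consecutive chunks should start roughly chunk_size minus chunk_overlap, i.e. about 800 characters, apart. A quick check:

for chunk in all_splits[:3]:
    print(chunk.metadata["start_index"], len(chunk.page_content))  # offsets should advance by roughly 800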
3. Embed the chunks and store them in a vector store
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")  # set the API key

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())  # embed the chunks and store them in Chroma
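As written, the index lives in memory and disappears when the process exits. Chroma.from_documents also accepts a persist_directory argument for writing the index to disk; the path below is only an example:

vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="./chroma_db",  # example path; reload later with Chroma(persist_directory=..., embedding_function=OpenAIEmbeddings())
)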
4. Test retrieval
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})  # similarity search, return the top 2 chunks
retrieved_docs = retriever.invoke("What is MPRester?")  # run a test query
print(len(retrieved_docs))  # check how many chunks came back
print(retrieved_docs[0].page_content)
print(retrieved_docs[1].page_content)
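With a 200-character overlap, plain similarity search can return near-duplicate chunks. The vector store also supports maximal marginal relevance (MMR) search, which trades a little relevance for diversity; a sketch:

mmr_retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2})
for doc in mmr_retriever.invoke("What is MPRester?"):
    print(doc.metadata.get("start_index"), doc.page_content[:100])  # show where each hit comes from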
5. Test generation
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o-mini")  # choose the chat model

from langchain import hub
prompt = hub.pull("rlm/rag-prompt")  # pull a standard RAG prompt from the LangChain hub

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Assemble the RAG chain: retrieve, format the context, fill the prompt, generate, parse to a string
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Test RAG generation with streaming output
for chunk in rag_chain.stream("Give a brief introduction to MPRester"):
    print(chunk, end="", flush=True)
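When streaming isn't needed, the same chain can be called with invoke, which returns the complete answer as one string:

answer = rag_chain.invoke("Give a brief introduction to MPRester")
print(answer)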
6. Wrap everything in a simple function
def chat_with_rag(question: str) -> None:
    """
    Answer a question with the RAG system, streaming the result to stdout.

    Args:
        question (str): the user's question
    """
    llm = ChatOpenAI(model="gpt-4o-mini")
    prompt = hub.pull("rlm/rag-prompt")

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    for chunk in rag_chain.stream(question):
        print(chunk, end="", flush=True)

# Usage example: the function streams its output and returns None, so it is called directly rather than assigned
chat_with_rag("Give a brief introduction to MPRester")
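Because chat_with_rag prints as it streams and returns None, the answer cannot be reused by callers. A hedged variant (the name chat_with_rag_return is an addition here, not from the original) that accumulates the streamed chunks and returns the full string:

def chat_with_rag_return(question: str) -> str:
    """Like chat_with_rag, but also returns the complete answer."""
    pieces = []
    for chunk in rag_chain.stream(question):  # reuses the rag_chain assembled in step 5
        print(chunk, end="", flush=True)
        pieces.append(chunk)
    return "".join(pieces)

answer = chat_with_rag_return("Give a brief introduction to MPRester")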