Continuing the running-diary notes through the rest of this book. This case study builds a prototype of a text-based personal knowledge base.
create_context_db:
# Milvus Setup Arguments
COLLECTION_NAME = 'text_content_search'
DIMENSION = 2048  # note: unused below; the actual dimension is measured from a test embedding
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# Inference Arguments
BATCH_SIZE = 128

from pymilvus import MilvusClient, utility, connections

milvus_client = MilvusClient(uri="http://localhost:19530")

# Connect to the instance
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

from markdown_processor import vectorize_segments, split_html_into_segments

test_embedding = vectorize_segments(split_html_into_segments("<h1>RAG還是挺有意思的!</h1>"))
embedding_dim = len(test_embedding[0])  # the raw test_embedding is nested, of the form [[...], [...]]
print(embedding_dim)
print(test_embedding[:10])

# Remove any previous collection with the same name
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

milvus_client.create_collection(
    collection_name=COLLECTION_NAME,
    dimension=embedding_dim,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Supported values are ("Strong", "Session", "Bounded", "Eventually").
                                 # See https://milvus.io/docs/consistency.md#Consistency-Level for details.
)

# The approach below can read .md files directly, then vectorize them and store them in the system:
# from tqdm import tqdm
# from glob import glob
#
# data = []
# text_lines = []
# for file_path in glob("milvus_docs/en/faq/*.md", recursive=True):
#     with open(file_path, "r") as file:
#         file_text = file.read()
#     text_lines += file_text.split("# ")
#
# for i, line in enumerate(tqdm(text_lines, desc="Creating embeddings")):
#     # the raw lines are markdown, not HTML, so vectorize them directly;
#     # each record's "vector" must be a single flat list of floats
#     data.append({"id": i, "vector": vectorize_segments([line])[0].tolist(), "text": line})
#
# milvus_client.insert(collection_name=COLLECTION_NAME, data=data)
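Not in the book, but a handy sanity check of my own: insert the test embedding itself and search it back. Since the bge vectors are L2-normalized and the collection uses inner product, the top hit's distance should come out near 1.0. A minimal sketch, assuming it runs right after the create_context_db code above:

# Sanity check (my own addition): round-trip the test embedding through Milvus
sanity_row = [{"id": 0, "vector": test_embedding[0].tolist(), "text": "RAG還是挺有意思的!"}]
milvus_client.insert(collection_name=COLLECTION_NAME, data=sanity_row)

hits = milvus_client.search(
    collection_name=COLLECTION_NAME,
    data=[test_embedding[0].tolist()],
    limit=1,
    search_params={"metric_type": "IP", "params": {}},
    output_fields=["text"],
)
# Normalized vectors + inner product = cosine similarity, so this should print ~1.0
print(hits[0][0]["distance"], hits[0][0]["entity"]["text"])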
markdown_processor.py — these days you could well do without this file.
import markdown
from bs4 import BeautifulSoup  # for parsing and manipulating HTML documents
from transformers import AutoTokenizer, AutoModel  # auto-load a pretrained model and its tokenizer
import torch  # deep-learning computation

def markdown_to_html(markdown_text):
    return markdown.markdown(markdown_text)

# Split an HTML document into segments
def split_html_into_segments(html_text):
    soup = BeautifulSoup(html_text, "html.parser")  # parse the HTML document
    segments = []  # list to hold the split segments
    # Find the heading, paragraph, unordered-list and ordered-list tags in the HTML document
    for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol"]):
        segments.append(tag.get_text())
    return segments

# Convert text segments into vector representations
def vectorize_segments(segments):
    # Use a pretrained tokenizer and model; here BAAI/bge-large-zh-v1.5, a Chinese model
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-zh-v1.5")
    model = AutoModel.from_pretrained("BAAI/bge-large-zh-v1.5")
    model.eval()  # evaluation mode: disables dropout and other training-only behavior
    # Encode the segments with padding and truncation, returning PyTorch tensors
    encoded_input = tokenizer(segments, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**encoded_input)  # run the model on the encoded input
    sentence_embeddings = model_output[0][:, 0]  # take the first ([CLS]) token as the sentence vector
    # L2-normalize the embeddings for later similarity comparison or clustering
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings
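To see what the module actually produces, here is a quick smoke test of my own (the markdown snippet is made up); the embedding width should be 1024 for bge-large-zh-v1.5:

if __name__ == "__main__":
    sample_md = "# 標題\n\n這是一段測試文字。\n\n- 列表項一\n- 列表項二\n"
    html = markdown_to_html(sample_md)
    segments = split_html_into_segments(html)
    print(segments)            # one string per heading/paragraph/list tag
    embeddings = vectorize_segments(segments)
    print(embeddings.shape)    # e.g. torch.Size([3, 1024])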
Flask's app.py:
from flask import Flask, request, jsonify
from flask import render_template
import requests
from markdown_processor import markdown_to_html, split_html_into_segments, vectorize_segments
from pymilvus import MilvusClient
import logging
import os

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = 'text_content_search'
TOP_K = 3

app = Flask(__name__)
milvus_client = MilvusClient(uri="http://localhost:19530")

@app.route("/")
def index():
    return render_template("index.html")

@app.route('/upload', methods=['POST'])
def upload():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected for uploading'}), 400
    markdown_text = file.read().decode('utf-8')
    html_text = markdown_to_html(markdown_text)
    segments = split_html_into_segments(html_text)
    vectors = vectorize_segments(segments)
    # Upload the vectors to the database
    data = []
    for i, (segment, vector) in enumerate(zip(segments, vectors)):
        data.append({"id": i + 1, "vector": vector.tolist(), "text": segment})
    milvus_client.insert(collection_name=COLLECTION_NAME, data=data)
    return jsonify({'message': 'File processed and vectors uploaded to the database'})

@app.route('/search', methods=['POST'])
def search():
    data = request.get_json()
    search_text = data.get('search')
    # Prepend the bge retrieval instruction to the query string
    instruction = "為這個句子生成表示以用于檢索相關文章:"
    search_text_with_instruction = instruction + search_text
    # Vectorize the modified query
    search_vector = vectorize_segments([search_text_with_instruction])[0].tolist()
    search_results = milvus_client.search(
        collection_name=COLLECTION_NAME,
        data=[search_vector],
        limit=TOP_K,  # Return the top 3 results
        search_params={"metric_type": "IP", "params": {}},  # Inner product distance
        output_fields=["text"],  # Return the text field
    )
    # Build the message list for the LLM API
    messages = [{"role": "system", "content": "You are a helpful assistant. Answer questions based solely on the provided content without making assumptions or adding extra information."}]
    # Parse the search results: each retrieved passage becomes context for the LLM
    for hit in search_results[0]:
        text = hit["entity"]["text"]
        print(text)
        messages.append({"role": "assistant", "content": text})
    messages.append({"role": "user", "content": search_text})
    # Send the request to DeepSeek and get the answer (via SiliconFlow)
    url = "https://api.ap.siliconflow.com/v1/chat/completions"
    payload = {
        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "messages": messages,
        "stream": False,
        "max_tokens": 1000,
        "stop": None,
        "temperature": 0.7,
        "top_p": 0.7,
        "top_k": 10,
        "frequency_penalty": 0.5,
        "n": 1,
        "response_format": {"type": "text"},
    }
    headers = {
        "Authorization": "Bearer <#your own token>",
        "Content-Type": "application/json"
    }
    response = requests.request("POST", url, json=payload, headers=headers)
    answer = response.text  # return the API's raw JSON; the front end can parse out the content
    return jsonify({'answer': answer})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5020, debug=True)
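You can exercise both endpoints without the web page. A sketch of my own, assuming the server is up on port 5020 and some notes.md exists locally:

import requests

BASE = "http://localhost:5020"

# Upload a markdown file; the server splits it, vectorizes it and stores it in Milvus
with open("notes.md", "rb") as f:
    print(requests.post(f"{BASE}/upload", files={"file": ("notes.md", f)}).json())

# Ask a question; the server retrieves context and forwards it to the LLM
r = requests.post(f"{BASE}/search", json={"search": "RAG是什么?"})
print(r.json()["answer"])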
A quick gripe: with SiliconFlow, this sort of DeepSeek API only gets you a handful of free questions before the meter starts running.
The small site's structure, along with the other miscellaneous code, can be viewed and downloaded directly at: https://www.ituring.com.cn/book/3305