The techniques below can be used to build a search engine without deep learning.
Vector search with a KD-Tree
A KD-Tree organizes the string feature vectors into a tree index, so the nearest vector can be found in O(log n) expected time.
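As a quick illustration (a minimal sketch, not from the repo), scipy.spatial.KDTree is built once over the document vectors and then queried per lookup:

import numpy as np
from scipy.spatial import KDTree

rng = np.random.default_rng(0)
vectors = rng.random((1000, 32))    # e.g. 1000 documents, 32-dim feature vectors
tree = KDTree(vectors)              # built once over all document vectors
distances, indices = tree.query(vectors[0], k=5)   # 5 nearest neighbors
print(indices, distances)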
Code source: https://github.com/zhaozh10/ChatCAD/blob/main/search_engine/src/db.py
The code is as follows:
import os
import pickle

import numpy as np


def save_db_kdtree(path_list, token_names, data, **kwargs):
    from scipy.spatial import KDTree
    if "db_path" in kwargs:
        db_path = kwargs["db_path"]
    else:
        db_path = f"search_engine/db/{kwargs['name']}"
    if not os.path.splitext(db_path)[1]:
        db_path += '.pt'
    # Ensure the target directory exists (added; the original assumes
    # search_engine/db/ already exists)
    os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
    # Densify the sparse feature matrix and normalize each row by its sum
    data = data.toarray()
    for v in data:
        if sum(v) != 0:
            v /= sum(v)
    tree = KDTree(data, copy_data=True)
    # Example query: distances, indices = tree.query(data[0], k=5)
    db = {"path_list": path_list, "token_names": token_names, "tree": tree}
    with open(db_path, 'wb') as f:
        pickle.dump(db, f)


def load_db_kdtree(**kwargs):
    if "db_path" in kwargs:
        db_path = kwargs["db_path"]
    else:
        db_path = f"search_engine/db/{kwargs['name']}"
    if not os.path.splitext(db_path)[1]:
        db_path += '.pt'
    # Load the pickled index and wrap it in a query object
    with open(db_path, 'rb') as f:
        db = pickle.load(f)
    path_list, token_names, tree = db["path_list"], db["token_names"], db["tree"]
    return Query_kdtree(path_list, token_names, tree)


class Query_kdtree:
    def __init__(self, path_list, token_names, tree) -> None:
        self.path_list, self.token_names, self.tree = path_list, token_names, tree

    def query(self, feature_vector, k=5):
        # Cast to a float array so in-place normalization also works on plain lists
        feature_vector = np.asarray(feature_vector, dtype=float)
        if sum(feature_vector) != 0:
            feature_vector /= sum(feature_vector)
        distances, indices = self.tree.query(feature_vector, k, workers=-1)
        return [(self.path_list[pid], distances[i]) for i, pid in enumerate(indices)]


save_db = save_db_kdtree
load_db = load_db_kdtree

# test pass
if __name__ == "__main__":
    import scipy.sparse as sp
    save_db_kdtree([114, 203], ['a', 'b'], sp.csr_matrix([[.5, .5], [-.5, .5]]), name="try_1")
    q = load_db_kdtree(name="try_1")
    print(q.query([1, 1], k=2))  # nearest first: (114, 0.0), then (203, 1.0)
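Two details worth noting in db.py: every document vector (and every query vector) is divided by its element sum before entering the tree, so matching happens on normalized term distributions rather than raw counts; and the whole index (path_list, token_names, and the KDTree) is persisted as a single pickle file, so loading never has to rebuild the tree.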
Using TF-IDF vectors as document features
When you have a (vocabs, document, library) setup, you can use the TF-IDF vector as the document's feature vector.
It can just as well be (vocabs, sentence, documents):
a sentence's TF-IDF vector is simply the TF-IDF values of all of its tokens.
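For reference, here is how a single token's weight is computed; a minimal sketch assuming scikit-learn's defaults (smooth_idf=True, followed by L2 row normalization):

import math

# tf-idf(t, d) = tf(t, d) * idf(t), with the smoothed IDF
# idf(t) = ln((1 + N) / (1 + df(t))) + 1; sklearn then L2-normalizes each row.
N = 3    # number of documents in the corpus
df = 2   # number of documents containing token t
tf = 1   # raw count of token t in document d
idf = math.log((1 + N) / (1 + df)) + 1
print(tf * idf)  # unnormalized TF-IDF weight of token t in document d

The full listing below builds these vectors with TfidfVectorizer.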
The code is as follows:
# -*- coding: utf-8 -*-
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Sample text data (Chinese)
corpus = [
    "機器學習是人工智能的一個分支",      # "Machine learning is a branch of AI"
    "深度學習是機器學習的子領域",        # "Deep learning is a subfield of machine learning"
    "自然語言處理是人工智能的重要方向"   # "NLP is an important direction of AI"
]

# Custom Chinese tokenizer
def chinese_tokenizer(text):
    # Segment the text with jieba
    return list(jieba.cut(text))

# Load a custom dictionary (optional)
# jieba.load_userdict("custom_dict.txt")  # uncomment if you have one

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(
    tokenizer=chinese_tokenizer,  # use the custom tokenizer
    stop_words=None,              # optional: stop words (Chinese needs a custom list)
    max_features=1000             # cap the vocabulary size
)

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(corpus)

# Get the vocabulary
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary (token_names):")
print(vocabulary)
print("\n")

# Shape of the TF-IDF matrix
print("TF-IDF matrix shape (n_documents, vocabulary_size):", tfidf_matrix.shape)
print("\n")

# Convert the sparse matrix to a dense array (easier to inspect)
dense_matrix = tfidf_matrix.toarray()

# Print each document's TF-IDF vector
for i, doc in enumerate(corpus):
    print(f"Document {i + 1}:")
    print(doc)
    print("TF-IDF vector:")
    print(dense_matrix[i])
    print("-" * 50)
print("\n")

# Example: train a classifier on the TF-IDF vectors
# Assumed labels: 0 = machine learning, 1 = natural language processing
labels = [0, 0, 1]

# Train a logistic-regression classifier
clf = LogisticRegression()
clf.fit(tfidf_matrix, labels)

# Test on new text
new_text = ["人工智能的未來"]   # "The future of artificial intelligence"
new_tfidf = vectorizer.transform(new_text)
predicted_label = clf.predict(new_tfidf)[0]
print("Predicted label for the new text:", predicted_label)
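Putting the two parts together, a hypothetical end-to-end sketch: it assumes save_db_kdtree/load_db_kdtree from the db.py listing above are importable, and reuses vectorizer, vocabulary, tfidf_matrix, and corpus from the listing just shown (doc_ids and the db_path file name are illustrative choices, not from the repo):

# Index each document's TF-IDF vector in a KD-Tree, then retrieve the
# nearest documents for a new query string.
doc_ids = list(range(len(corpus)))   # stand-ins for document paths
save_db_kdtree(doc_ids, vocabulary, tfidf_matrix, db_path="tfidf_demo.pt")

q = load_db_kdtree(db_path="tfidf_demo.pt")
query_vec = vectorizer.transform(["人工智能"]).toarray()[0]
print(q.query(query_vec, k=2))       # [(doc_id, distance), ...], nearest first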