NLP --- TF-IDF 案例分析

一·案例 - 紅樓夢

1首先準備語料庫

http://www.dxsxs.com

到上面這個網址下載《紅樓夢》全文，保存為「紅樓夢.txt」，並與腳本放在同一個文件夾。

2 任務一：拆分提取

import os
import re


def _save_chapter(output_dir, index, chapter, lines, end_pattern):
    """Write one chapter to ``<output_dir>/<index, zero-padded to 3>_<chapter>.txt``.

    Everything from the first "且聽下回分解" to the end of the chapter text is
    removed (via ``end_pattern``) before writing.
    """
    text = end_pattern.sub('', '\n'.join(lines)).strip()
    output_path = os.path.join(output_dir, f"{str(index).zfill(3)}_{chapter}.txt")
    with open(output_path, 'w', encoding='utf-8') as f_out:
        f_out.write(text)


def split_hongloumeng(input_path=None, output_dir=None):
    """Split the full text of the novel into one UTF-8 file per chapter.

    Args:
        input_path: Path of the UTF-8 source text. Defaults to "紅樓夢.txt"
            next to this script.
        output_dir: Folder that receives the chapter files; created if it
            does not exist. Defaults to a "分卷" folder next to this script.

    Returns:
        None. Files named "001_第一回.txt", "002_第二回.txt", ... are written
        to ``output_dir`` and a short summary is printed.

    Notes:
        * Any non-empty line containing "第X回" (Chinese or Arabic numerals)
          starts a new chapter. Only the matched "第X回" token is kept as the
          first line of the chapter; the rest of that heading line is dropped.
        * Text that appears before the first chapter heading is discarded.
    """
    # Resolve the defaults lazily so callers passing explicit paths do not
    # depend on __file__ being defined.
    if input_path is None or output_dir is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        if input_path is None:
            input_path = os.path.join(script_dir, "紅樓夢.txt")
        if output_dir is None:
            output_dir = os.path.join(script_dir, "分卷")
    os.makedirs(output_dir, exist_ok=True)

    # Site banner inserted by the e-book source ("手機電子書 ... 本章字數:123");
    # every occurrence is removed.
    header_pattern = re.compile(r'手機電子書·大學生小說網.*?本章字數:\d+', re.DOTALL)
    # Chapter heading token. 零 and 〇 are included so that headings such as
    # "第一百零一回" are recognised instead of being merged into the
    # previous chapter.
    chapter_pattern = re.compile(r'第(?:[零〇一二三四五六七八九十百千萬]+|\d+)回', re.UNICODE)
    # Closing formula of a chapter and everything after it.
    end_pattern = re.compile(r'且聽下回分解.*?$', re.DOTALL)

    with open(input_path, 'r', encoding='utf-8') as f:
        content = header_pattern.sub('', f.read()).strip()

    chapters_written = 0
    current_chapter = None  # heading token of the chapter being collected
    current_lines = []      # non-empty lines of the chapter being collected

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        chapter_match = chapter_pattern.search(line)
        if chapter_match is None:
            current_lines.append(line)
            continue

        # A new heading: flush the chapter collected so far, if any.
        if current_chapter:
            chapters_written += 1
            _save_chapter(output_dir, chapters_written, current_chapter,
                          current_lines, end_pattern)

        current_chapter = chapter_match.group(0)
        current_lines = [current_chapter]

    # The last chapter has no following heading to trigger its flush.
    if current_chapter:
        chapters_written += 1
        _save_chapter(output_dir, chapters_written, current_chapter,
                      current_lines, end_pattern)

    print(f"? 切割完成！共 {chapters_written} 回")
    print(f"📁 保存位置：{output_dir}")
    print("🔍 文件名按【001_第一回、002_第二回...】排序，可直接用")


if __name__ == "__main__":
    split_hongloumeng()

任務二：把分好後的各卷轉換成 TF-IDF 能識別的分詞文本

#
import pandas as pd  # 數據預處理庫
import os  # 用于文件和目錄操作
import jieba  # 用于中文分詞# 獲取當前腳本所在的目錄路徑
# Directory containing this script; every data file below is resolved
# relative to it so the script does not depend on the working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Parallel lists: filePaths[i] is the path of the file whose text is
# fileContents[i].
filePaths = []
fileContents = []

# Collect every file under the "分卷" folder (sub-folders included).
fenjuan_dir = os.path.join(current_dir, "分卷")
for root, dirs, files in os.walk(fenjuan_dir):
    # os.walk yields names in arbitrary order; sorting keeps the corpus in
    # file-name order (001_, 002_, ...) so line i of wj.txt is chapter i.
    dirs.sort()
    for name in sorted(files):
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        with open(filePath, 'r', encoding='utf-8') as f:
            fileContents.append(f.read())

# One row per file: its path and its full text.
corpos = pd.DataFrame({
    'filePath': filePaths,
    'fileContent': fileContents,
})

# Load the novel-specific user dictionary to improve segmentation accuracy.
user_dict_path = os.path.join(current_dir, "紅樓夢詞庫.txt")
jieba.load_userdict(user_dict_path)

# Stop-word list; the file is expected to provide a column named "stopword".
stopwords_path = os.path.join(current_dir, "StopwordsCN.txt")
stopwords = pd.read_csv(stopwords_path, encoding='utf8', engine='python', index_col=False)
# Build the lookup set once instead of scanning the whole column per token.
stopword_set = set(stopwords.stopword.values)

# Write one line per file: its kept tokens, each followed by a single space.
output_file = os.path.join(current_dir, "wj.txt")
with open(output_file, 'w', encoding='utf-8') as file_to_jieba:
    for fileContent in corpos['fileContent']:
        # Drop stop words and whitespace-only tokens (including newlines).
        juan_ci = ''.join(
            seg + ' '
            for seg in jieba.cut(fileContent)
            if seg not in stopword_set and seg.strip()
        )
        file_to_jieba.write(juan_ci + '\n')

1. 導入所需庫

python

運行

import pandas as pd  # 數據預處理庫
import os  # 用于文件和目錄操作
import jieba  # 用于中文分詞

導入pandas庫，用于數據的結構化處理（如創建 DataFrame）
導入os庫，用于處理文件路徑和目錄遍歷
導入jieba庫，用于中文文本的分詞處理

2. 獲取當前腳本所在目錄

python

運行

# Absolute location of this script file, then the folder that contains it.
script_path = os.path.abspath(__file__)
current_dir = os.path.dirname(script_path)

os.path.abspath(__file__)獲取當前腳本的絕對路徑
os.path.dirname()提取該路徑中的目錄部分
目的是獲取可靠的基準路徑，避免相對路徑帶來的問題

3. 初始化存儲數據的列表

python

運行

# Two independent, initially empty accumulators: one for file paths and one
# for the text read from each of those files.
filePaths, fileContents = [], []

創建兩個空列表，分別用于存儲后續讀取的文件路徑和文件內容

4. 遍歷文件夾并讀取文件內容

python

運行

# Collect every file under the "分卷" folder (sub-folders included).
fenjuan_dir = os.path.join(current_dir, "分卷")
for root, dirs, files in os.walk(fenjuan_dir):
    # os.walk yields names in arbitrary order; sorting keeps the files in
    # file-name order (001_, 002_, ...).
    dirs.sort()
    for name in sorted(files):
        filePath = os.path.join(root, name)
        filePaths.append(filePath)
        with open(filePath, 'r', encoding='utf-8') as f:
            fileContent = f.read()
        fileContents.append(fileContent)

拼接得到 "分卷" 文件夾的完整路徑
使用os.walk()遍歷 "分卷" 文件夾下的所有文件
對每個文件，拼接完整路徑并添加到filePaths列表
以 UTF-8 編碼打開文件，讀取內容并添加到fileContents列表

5. 創建數據框存儲文件信息

python

運行

# Column name -> values; both lists are filled in step by the directory walk.
corpus_columns = {'filePath': filePaths, 'fileContent': fileContents}
corpos = pd.DataFrame(corpus_columns)

使用pandas.DataFrame()創建數據框
將文件路徑和內容分別作為兩列存儲，便于后續按行處理

6. 加載自定義詞庫

python

運行

# The custom dictionary file sits next to the script.
user_dict_name = "紅樓夢詞庫.txt"
user_dict_path = os.path.join(current_dir, user_dict_name)
# Register its entries with jieba before any text is segmented.
jieba.load_userdict(user_dict_path)

拼接得到紅樓夢專屬詞庫的路徑
加載自定義詞庫，讓 jieba 分詞更符合《紅樓夢》的語言特點

7. 讀取停用詞庫

python

運行

# The stop-word list sits next to the script; the filtering loop reads its
# "stopword" column.
stopwords_path = os.path.join(current_dir, "StopwordsCN.txt")
stopwords = pd.read_csv(
    stopwords_path,
    index_col=False,
    engine='python',
    encoding='utf8',
)

拼接得到停用詞文件的路徑
讀取中文停用詞表，用于后續過濾無意義詞匯（如 "的"、"了" 等）

8. 準備輸出文件

python

運行

# Destination for the segmented text, next to the script.
output_file = os.path.join(current_dir, "wj.txt")
# The handle is left open for the segmentation loop that writes to it and
# must be closed once that loop has finished.
file_to_jieba = open(output_file, mode='w', encoding='utf-8')

定義分詞結果的輸出文件路徑
以寫入模式打開文件，準備存儲處理后的結果

9. 分詞處理并過濾停用詞

python

運行

# Build the stop-word lookup once instead of scanning the column per token.
stopword_set = set(stopwords.stopword.values)

# One output line per file: its kept tokens, each followed by a single space.
for index, row in corpos.iterrows():
    fileContent = row['fileContent']
    segs = jieba.cut(fileContent)
    # Drop stop words and whitespace-only tokens (including newlines).
    juan_ci = ''.join(
        seg + ' '
        for seg in segs
        if seg not in stopword_set and seg.strip()
    )
    file_to_jieba.write(juan_ci + '\n')