讓他分別用100*10000個單元格有100、1000、10000個不同的1-200字符長的大寫英文字母字符串測試.
一開始DeepSeek沒有找到啟用sharedStrings.xml的寫xlsx模塊,我自己找了pyxlsbwriter的例子告訴他才改好的。
import os
import time
import random
import string
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import matplotlib.pyplot as plt
import numpy as np
from pyxlsbwriter import XlsxWriterdef generate_random_string(length):"""生成指定長度的大寫英文字母隨機字符串"""return ''.join(random.choices(string.ascii_uppercase, k=length))def create_test_data(rows, cols, unique_strings):"""創建測試數據rows: 行數cols: 列數unique_strings: 不同字符串的數量"""# 生成唯一字符串池string_pool = [generate_random_string(random.randint(1, 200)) for _ in range(unique_strings)]# 創建數據矩陣data = []for i in range(rows):row = []for j in range(cols):# 從字符串池中隨機選擇一個字符串row.append(random.choice(string_pool))data.append(row)return datadef write_excel_with_shared_strings(data, filename):"""使用shared strings寫入Excel文件(使用pyxlsbwriter庫確保生成sharedStrings.xml)"""# 使用XlsxWriter創建xlsx文件,默認會使用shared stringswith XlsxWriter(filename, compressionLevel=6) as writer:writer.add_sheet("Sheet1")writer.write_sheet(data) def write_excel_without_shared_strings(data, filename):"""不使用shared strings寫入Excel文件(轉換為pandas DataFrame再保存)"""df = pd.DataFrame(data)df.to_excel(filename, index=False, header=False)def read_excel(filename):"""讀取Excel文件并測量時間"""start_time = time.time()df = pd.read_excel(filename, header=None)end_time = time.time()return end_time - start_time, dfdef test_scenario(rows, cols, unique_strings, output_dir="test_results"):"""測試一個場景:特定行、列和唯一字符串數量的情況"""if not os.path.exists(output_dir):os.makedirs(output_dir)# 創建測試數據print(f"生成測試數據: {rows}行 x {cols}列, {unique_strings}個唯一字符串")data = create_test_data(rows, cols, unique_strings)# 測試啟用shared strings的情況with_shared_file = os.path.join(output_dir, f"with_shared_{rows}_{cols}_{unique_strings}.xlsx")start_time = time.time()write_excel_with_shared_strings(data, with_shared_file)with_shared_write_time = time.time() - start_timewith_shared_read_time, with_shared_df = read_excel(with_shared_file)with_shared_size = os.path.getsize(with_shared_file)# 測試不啟用shared strings的情況without_shared_file = os.path.join(output_dir, f"without_shared_{rows}_{cols}_{unique_strings}.xlsx")start_time = time.time()write_excel_without_shared_strings(data, without_shared_file)without_shared_write_time = time.time() - start_timewithout_shared_read_time, without_shared_df = read_excel(without_shared_file)without_shared_size = os.path.getsize(without_shared_file)# 驗證數據一致性assert with_shared_df.equals(without_shared_df), "兩種方式保存的數據不一致!"# 返回結果return {'rows': rows,'cols': cols,'unique_strings': unique_strings,'with_shared_write_time': with_shared_write_time,'with_shared_read_time': with_shared_read_time,'with_shared_size': with_shared_size,'without_shared_write_time': without_shared_write_time,'without_shared_read_time': without_shared_read_time,'without_shared_size': without_shared_size}def main():"""主函數"""# 測試配置rows = 100cols = 10000unique_strings_list = [100, 1000, 10000, 100000]results = []# 運行測試for unique_strings in unique_strings_list:result = test_scenario(rows, cols, unique_strings)results.append(result)# 打印當前測試結果print(f"\n測試結果 (唯一字符串數: {unique_strings}):")print(f"啟用shared strings - 寫入時間: {result['with_shared_write_time']:.2f}s, "f"讀取時間: {result['with_shared_read_time']:.2f}s, "f"文件大小: {result['with_shared_size']/1024/1024:.2f}MB")print(f"禁用shared strings - 寫入時間: {result['without_shared_write_time']:.2f}s, "f"讀取時間: {result['without_shared_read_time']:.2f}s, "f"文件大小: {result['without_shared_size']/1024/1024:.2f}MB")# 繪制結果圖表plot_results(results)def plot_results(results):from pylab import mpl# 設置顯示中文字體mpl.rcParams["font.sans-serif"] = ["SimSun"]# 設置正常顯示符號mpl.rcParams["axes.unicode_minus"] = False"""繪制測試結果圖表"""unique_strings = [r['unique_strings'] for r in results]# 創建圖表fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))# 寫入時間對比with_shared_write_times = [r['with_shared_write_time'] for r in results]without_shared_write_times = [r['without_shared_write_time'] for r in results]ax1.plot(unique_strings, with_shared_write_times, 'o-', label='啟用shared strings')ax1.plot(unique_strings, without_shared_write_times, 'o-', label='禁用shared strings')ax1.set_xlabel('唯一字符串數量')ax1.set_ylabel('寫入時間 (秒)')ax1.set_title('寫入時間對比')ax1.legend()ax1.grid(True)# 讀取時間對比with_shared_read_times = [r['with_shared_read_time'] for r in results]without_shared_read_times = [r['without_shared_read_time'] for r in results]ax2.plot(unique_strings, with_shared_read_times, 'o-', label='啟用shared strings')ax2.plot(unique_strings, without_shared_read_times, 'o-', label='禁用shared strings')ax2.set_xlabel('唯一字符串數量')ax2.set_ylabel('讀取時間 (秒)')ax2.set_title('讀取時間對比')ax2.legend()ax2.grid(True)# 文件大小對比with_shared_sizes = [r['with_shared_size']/1024/1024 for r in results]without_shared_sizes = [r['without_shared_size']/1024/1024 for r in results]ax3.plot(unique_strings, with_shared_sizes, 'o-', label='啟用shared strings')ax3.plot(unique_strings, without_shared_sizes, 'o-', label='禁用shared strings')ax3.set_xlabel('唯一字符串數量')ax3.set_ylabel('文件大小 (MB)')ax3.set_title('文件大小對比')ax3.legend()ax3.grid(True)# 總時間對比with_shared_total_times = [r['with_shared_write_time'] + r['with_shared_read_time'] for r in results]without_shared_total_times = [r['without_shared_write_time'] + r['without_shared_read_time'] for r in results]ax4.plot(unique_strings, with_shared_total_times, 'o-', label='啟用shared strings')ax4.plot(unique_strings, without_shared_total_times, 'o-', label='禁用shared strings')ax4.set_xlabel('唯一字符串數量')ax4.set_ylabel('總時間 (秒)')ax4.set_title('總時間 (寫入+讀取) 對比')ax4.legend()ax4.grid(True)plt.tight_layout()plt.savefig('shared_strings_performance_comparison.png', dpi=300)plt.show()# 打印詳細結果表格print("\n詳細測試結果:")print("唯一字符串數 | 啟用shared寫入時間 | 禁用shared寫入時間 | 啟用shared讀取時間 | 禁用shared讀取時間 | 啟用shared文件大小 | 禁用shared文件大小")for r in results:print(f"{r['unique_strings']:>12} | {r['with_shared_write_time']:>17.2f} | {r['without_shared_write_time']:>17.2f} | "f"{r['with_shared_read_time']:>17.2f} | {r['without_shared_read_time']:>17.2f} | "f"{r['with_shared_size']/1024/1024:>17.2f} | {r['without_shared_size']/1024/1024:>17.2f}")if __name__ == "__main__":main()
執行結果
生成測試數據: 100行 x 10000列, 100個唯一字符串測試結果 (唯一字符串數: 100):
啟用shared strings - 寫入時間: 0.91s, 讀取時間: 2.70s, 文件大小: 1.60MB
禁用shared strings - 寫入時間: 10.37s, 讀取時間: 9.21s, 文件大小: 12.16MB
生成測試數據: 100行 x 10000列, 1000個唯一字符串測試結果 (唯一字符串數: 1000):
啟用shared strings - 寫入時間: 0.89s, 讀取時間: 2.94s, 文件大小: 2.24MB
禁用shared strings - 寫入時間: 11.71s, 讀取時間: 9.52s, 文件大小: 53.34MB
生成測試數據: 100行 x 10000列, 10000個唯一字符串測試結果 (唯一字符串數: 10000):
啟用shared strings - 寫入時間: 0.85s, 讀取時間: 2.97s, 文件大小: 3.29MB
禁用shared strings - 寫入時間: 12.11s, 讀取時間: 9.60s, 文件大小: 64.52MB
生成測試數據: 100行 x 10000列, 100000個唯一字符串測試結果 (唯一字符串數: 100000):
啟用shared strings - 寫入時間: 3.10s, 讀取時間: 3.77s, 文件大小: 9.54MB
禁用shared strings - 寫入時間: 12.65s, 讀取時間: 9.96s, 文件大小: 66.06MB
結果顯示禁用shared strings時寫入更慢且文件更大,而啟用時文件更小且寫入更快,隨著唯一字符串數量增加,啟用shared strings的效率下降,不啟用沒有變化。驗證了shared strings在重復字符串場景下的存儲優化效果。